From 82f476bd18bea6034d087e0103eff5617d975fe1 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 8 Nov 2018 13:16:13 +0000
Subject: [PATCH 01/37] Re-enable truly_random optimizer

- Task-migration still fails but much less frequently
---
 src/optimizer.cpp | 12 +++++++++++-
 src/scheduler.cpp | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 24419b2..e54f678 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -453,7 +453,7 @@ hpx::future<void> global_optimizer::decide_random_mapping(const std::vector<std:
         #ifdef TRULY_RANDOM_DEBUG
         std::cerr << "Will exclude " << how_many_to_exclude << " out of " << num_localities << std::endl;
         #endif
-
+        #if 1
         for (auto i=0ul; i<how_many_to_exclude; ++i) {
             auto new_exclude = get_random_node();
             exclude.push_back(new_exclude);
@@ -462,6 +462,15 @@ hpx::future<void> global_optimizer::decide_random_mapping(const std::vector<std:
             std::cerr << "Excluded: " << new_exclude << std::endl;
             #endif
         }
+        #else
+        for ( auto i=num_localities-how_many_to_exclude; i<num_localities; ++i) {
+            exclude.push_back(num_localities-i-1);
+
+            #ifdef TRULY_RANDOM_DEBUG
+            std::cerr << "Excluded: " << i << std::endl;
+            #endif
+        }
+        #endif
     }
 
     u_steps_till_rebalance = u_balance_every;
@@ -562,6 +571,7 @@ hpx::future<void> global_optimizer::balance_ino(const std::vector<std::size_t> &
                     #ifdef INO_DEBUG_DECIDE_SCHEDULE
                     std::cerr << "Ino picked a schedule" << std::endl;
                     #endif
+
                     for (auto node_wis : ino_schedule)
                         for (auto wi : node_wis.second.v_work_items)
                             new_mapping[wi] = node_wis.first;
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 43d9605..662151c 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -188,6 +188,8 @@ namespace allscale
                         return "ino";
                     case random:
                         return "random";
+                    case truly_random:
+                        return "truly_random";
                     default:
                         return "unknown";
                 }
@@ -224,6 +226,13 @@ namespace allscale
                     tree_scheduling_policy::create_uniform(allscale::get_num_localities())
                 };
             }
+            if (policy == "truly_random")
+            {
+                return {
+                    replacable_policy::truly_random,
+                    tree_scheduling_policy::create_uniform(allscale::get_num_localities())
+                };
+            }
             if (policy == "random")
             {
                 return {
@@ -394,6 +403,11 @@ namespace allscale
                 optimizer_.balance_ino(old.task_distribution_mapping());
             }
 
+            if (policy_.value_ == replacable_policy::truly_random) {
+                tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
+                optimizer_.decide_random_mapping(old.task_distribution_mapping());
+            }
+
             return true;
         }
 

From 7ba05c445d1a4047ebf4b31fef5b5af196213cb7 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Fri, 9 Nov 2018 13:58:45 +0000
Subject: [PATCH 02/37] Power instead of Energy for INO, plus some integration
 with Dashboard

- Dashboard displays power as the ratio of the current power
  consumption to the maximum power consumption.
- It uses a model to generate these values, which can be inaccurate.
- This commit reads the real power consumption but does not supply
  a maximum power consumption.
---
 src/components/localoptimizer.cpp    |  1 -
 src/components/monitor_component.cpp | 36 +++++++++++++++++++++++++++-
 src/dashboard.cpp                    |  4 +++-
 src/optimizer.cpp                    | 11 ++++-----
 4 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 86faa91..1fae44c 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -19,7 +19,6 @@
 // only meant to be defined if one needs to measure the efficacy
 // of the scheduler
 //#define ALLSCALE_HAVE_CPUFREQ 1
-#define ALLSCALE_USE_CORE_OFFLINING 1
 
 namespace allscale {
 namespace components {
diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp
index 667046b..947bac4 100644
--- a/src/components/monitor_component.cpp
+++ b/src/components/monitor_component.cpp
@@ -26,6 +26,11 @@
 
 #include <hpx/lcos/gather.hpp>
 
+#ifdef ALLSCALE_HAVE_CPUFREQ
+#define POWER_MEASUREMENT_PERIOD_MS 100
+#include <allscale/util/hardware_reconf.hpp>
+#endif
+
 #ifdef HAVE_PAPI
 #include <boost/tokenizer.hpp>
 #include <string.h>
@@ -329,13 +334,42 @@ namespace allscale { namespace components {
 
    float monitor::get_current_power()
    {
+#ifdef ALLSCALE_HAVE_CPUFREQ
+      /*VV: Read potentially multiple measurements of power within the span of 
+            POWER_MEASUREMENT_PERIOD_MS milliseconds. Each time this function
+            is invoked it returns the running average of power.*/
+      static unsigned long long times_read_power=1;
+      static unsigned long long power_sum = util::hardware_reconf::read_system_power();
+
+      static long timestamp_reset_power = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+      
+      long t_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+      auto dt = t_now - timestamp_reset_power;
+      times_read_power ++;
+
+      power_sum += util::hardware_reconf::read_system_power();
+
+      float ret = power_sum / (float)(times_read_power);
+
+      if ( dt >= POWER_MEASUREMENT_PERIOD_MS ) {
+            times_read_power = 0;
+            power_sum = 0ull;
+            timestamp_reset_power = t_now;
+      }
+
+      return ret;
+#else
       return allscale::power::estimate_power(get_current_freq(0)) * num_cpus_;
+#endif
    }
 
 
    float monitor::get_max_power()
    {
-#ifdef POWER_ESTIMATE
+#if defined(ALLSCALE_HAVE_CPUFREQ)
+      return 0.0;
+#elif defined(POWER_ESTIMATE)
       return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_;
 #else
       return 0.0;
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index 0adc50f..ed528c1 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -60,7 +60,9 @@ namespace allscale { namespace dashboard
         state.speed = 1.f - state.idle_rate;
         state.efficiency = state.speed * ((state.cur_frequency * active_cores) / (state.max_frequency * state.num_cores));
 
-#ifdef POWER_ESTIMATE
+#ifdef ALLSCALE_HAVE_CPUFREQ
+        state.power = monitor_c->get_current_power();
+#elif defined(POWER_ESTIMATE)
         state.cur_power = monitor_c->get_current_power();
         state.max_power = monitor_c->get_max_power();
         state.power = state.cur_power / state.max_power;
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index e54f678..d35ed9a 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -29,7 +29,6 @@ namespace allscale
 {
     optimizer_state get_optimizer_state()
     {
-        static float last_energy = 0.f;
         float load = 1.f - monitor::get().get_idle_rate();
         float my_time = monitor::get().get_avg_time_last_iterations(HISTORY_ITERATIONS);
 
@@ -37,16 +36,16 @@ namespace allscale
             my_time = -1.f;
 
         allscale::components::monitor *monitor_c = &allscale::monitor::get();
-        float energy = 100.f;
-#ifdef POWER_ESTIMATE
-        energy = monitor_c->get_current_power();
+        float power_now = 100.f;
+#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
+        power_now = monitor_c->get_current_power();
 #endif
-
+        // VV: Use power as if it were energy
         return {
             load,
             monitor::get().get_task_times(),
             my_time,
-            energy,
+            power_now,
             float(monitor_c->get_current_freq(0)),
             scheduler::get().get_active_threads()
         };

From 3b2d71fa7a29c9468f506344183f1e4104cb53c4 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 12 Nov 2018 10:57:00 +0000
Subject: [PATCH 03/37] Patching up intra-node optimizer

---
 allscale/components/localoptimizer.hpp | 15 ++++---
 src/components/localoptimizer.cpp      | 56 +++++++++++++++++---------
 src/components/nmsimplex_bbincr.cpp    | 12 +++---
 src/components/scheduler_component.cpp | 47 ++++++++-------------
 4 files changed, 69 insertions(+), 61 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index c0e588a..1f7aae0 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -15,7 +15,8 @@
 
 //#define MEASURE_MANUAL_ 1
 #define MEASURE_ 1
-//#define DEBUG_ 1
+#define DEBUG_ 1
+#define DEBUG_MULTIOBJECTIVE_ 1
 
 namespace allscale { namespace components {
 
@@ -71,7 +72,8 @@ namespace allscale { namespace components {
         /* index to the global cpu-supported frequencies vector pointing to
            the new frequency to be set. If set to -1, frequency will stay
            unchanged */
-       unsigned int frequency_idx;
+       int frequency_idx;
+       int previous_frequency_idx;
 #endif
     };
 
@@ -124,6 +126,9 @@ namespace allscale { namespace components {
             return frequencies_param_allowed_;
         }
 #endif
+        std::size_t getmaxthreads() {
+            return max_threads_;
+        }
 
         void setmaxthreads(std::size_t threads){
             max_threads_=threads;
@@ -197,12 +202,6 @@ namespace allscale { namespace components {
         /* vector containing sorted list of frequencies supported by the
            processor */
         std::vector<unsigned long> frequencies_param_allowed_;
-
-        /* index to the vector of allowed frequencies that points to the highest
-           frequency. The ordering of the vector, as reported by hardware
-           reconfiguration can be platform specific, and therefore we need this
-           index to make sorted access to the vector platform agnostic */
-        const short unsigned int highest_frequency_allowed_idx_ = 0;
 #endif
 
         /* threshold (percentage in [0,1]) to decide convergence of optimization
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 1fae44c..0d5d59b 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -10,9 +10,9 @@
 #include <stdlib.h>
 #include <stdexcept>
 
-//#define DEBUG_ 1
+#define DEBUG_ 1
 //#define DEBUG_INIT_ 1 // define to generate output during scheduler initialization
-//#define DEBUG_MULTIOBJECTIVE_ 1
+#define DEBUG_MULTIOBJECTIVE_ 1
 //#define DEBUG_CONVERGENCE_ 1
 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test
 #define MEASURE_ 1
@@ -109,6 +109,9 @@ void localoptimizer::printobjectives(){
 }
 
 void localoptimizer::printverbosesteps(actuation act){
+  static int last_frequency_idx = 0;
+
+
   std::cout << "[INFO]";
   if (optmethod_==random)
     std::cout << "Random ";
@@ -116,18 +119,24 @@ void localoptimizer::printverbosesteps(actuation act){
     std::cout << "Allscale ";
   }
   std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  std::cout << ", CPU Frequency to " << frequencies_param_allowed_[act.frequency_idx]
+  #ifdef ALLSCALE_HAVE_CPUFREQ
+  if ( act.frequency_idx >= 0 )
+    last_frequency_idx = act.frequency_idx;
+  std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx]
     << std::endl;
 #else
   std::cout << std::endl;
 #endif
-
 }
 
 #endif
 
 void localoptimizer::measureObjective(double iter_time, double power, double threads){
+  std::cout <<"Measuring objective: " 
+            << iter_time << " " 
+            << power << " " 
+            << threads << std::endl;
+  
   for(auto& el: objectives_){
     switch (el.type){
       case time:
@@ -235,18 +244,18 @@ actuation localoptimizer::step()
     /* random optimization step */
     if (optmethod_ == random)
     {
-        act.delta_threads = (rand() % max_threads_) - threads_param_;
+        act.delta_threads = (rand() % max_threads_);
 #ifdef ALLSCALE_HAVE_CPUFREQ
         act.frequency_idx = rand() % frequencies_param_allowed_.size();
-        if (act.frequency_idx == frequency_param_)
-            act.frequency_idx = -1;
+        // if (act.frequency_idx == frequency_param_)
+        //     act.frequency_idx = -1;
 #endif
     }
 
     else if (optmethod_ == allscale)
     {
         if (current_objective_idx_ > objectives_.size())
-  	    	return act;
+  	      goto validate_act;
 
         if (steps_ < warmup_steps_)
         {
@@ -260,7 +269,7 @@ actuation localoptimizer::step()
 #ifdef ALLSCALE_HAVE_CPUFREQ
     	    act.frequency_idx = rand() % frequencies_param_allowed_.size();
 #endif
-            return act;
+        goto validate_act;
         }
 
         // iterate over all objectives in decreasing priority
@@ -305,7 +314,7 @@ actuation localoptimizer::step()
             double constraint_min[]={1,0};
             double constraint_max[]={(double)max_threads_,
                 (double)frequencies_param_allowed_.size()};
-
+            std::cout << "initialize_simplex::Initializing with " << frequencies_param_allowed_.size() << " frequencies" << std::endl;
             nmd.initialize_simplex(params,values,constraint_min,constraint_max);
             objectives_[current_objective_idx_].initialized=true;
 #endif
@@ -420,8 +429,6 @@ actuation localoptimizer::step()
                         act.frequency_idx = (int)priority_obj.minimization_params[1]*
                             (max_leeway_value/priority_obj.converged_minimum);
 #endif
-                        //act.delta_threads=minimization_point[0];
-			            //act.frequency_idx=minimization_point[1];
 			            current_objective_idx_++;
 			            if (current_objective_idx_ == objectives_.size())
                         {
@@ -430,15 +437,28 @@ actuation localoptimizer::step()
                             std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
 #endif
                         }
-                        return act;
-                    }
-    		}
-            act.delta_threads=(nmd_res.threads==0)?getCurrentThreads():nmd_res.threads;
+                        act.delta_threads=(nmd_res.threads==0)?getCurrentThreads():nmd_res.threads;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-            act.frequency_idx=nmd_res.freq_idx;
+                        act.frequency_idx=nmd_res.freq_idx;
 #endif
+
+                        goto validate_act;
+                    }
+    		}
         }
     }
+    validate_act:
+
+    if ( act.delta_threads > max_threads_) {
+      act.delta_threads = max_threads_;
+    } else if ( act.delta_threads < 1 ) {
+      act.delta_threads = getCurrentThreads();
+    }
+#ifdef ALLSCALE_HAVE_CPUFREQ
+    // VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
+    if ( act.frequency_idx < 0)
+      act.frequency_idx= frequency_param_;
+#endif
     return act;
 }
 }
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 97736cd..a0c964e 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -11,8 +11,8 @@
  */
 
 #include <allscale/components/nmsimplex_bbincr.hpp>
-//#define NMD_DEBUG_ 1
-//#define NMD_INFO_ 1
+#define NMD_DEBUG_ 1
+#define NMD_INFO_ 1
 
 /* create the initial simplex
 
@@ -212,11 +212,13 @@ optstepresult NelderMead::step(double param)
       for (j=0;j<=n-1;j++) {
         /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j];*/
         /*
-        std::cout << "vm[" << j << "]=" << vm[j] << std::endl;
-        std::cout << "v[vg" << j << "]=" << v[vg][j] << std::endl;
-        std::cout << "ALPHA=" << ALPHA << std::endl;
         */
         vr[j] = vm[j]+ALPHA*(vm[j]-v[vg][j]);
+
+        // std::cout << "vm[" << j << "]=" << vm[j] << std::endl;
+        // std::cout << "v[vg" << j << "]=" << v[vg][j] << std::endl;
+        // std::cout << "ALPHA=" << ALPHA << std::endl;
+        // std::cout << "Vr[" << j << "]=" << vr[j] << std::endl;
       }
       my_constraints(vr);
 #ifdef NMD_DEBUG_
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 7185b23..678d539 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -22,7 +22,7 @@
 
 //#define DEBUG_ 1
 //#define DEBUG_INIT_ 1 // define to generate output during scheduler initialization
-//#define DEBUG_MULTIOBJECTIVE_ 1
+#define DEBUG_MULTIOBJECTIVE_ 1
 //#define DEBUG_THREADTHROTTLING_ 1
 //#define DEBUG_THREADSTATUS_ 1
 //#define DEBUG_FREQSCALING_ 1
@@ -719,7 +719,6 @@ void scheduler::optimize_locally(work_item const& work)
         /* Count Active threads for validation*/
 
         hpx::threads::mask_type active_mask;
-        std::size_t active_threads_ = 0;
         std::size_t domain_active_threads = 0;
         std::size_t pool_idx = 0;
         int total_threads_counted=0;
@@ -741,14 +740,13 @@ void scheduler::optimize_locally(work_item const& work)
 #ifdef MEASURE_
 #ifdef ALLSCALE_HAVE_CPUFREQ
         std::size_t temp_id = work.id().id;
-        if ((temp_id >= period_for_power) &&
-                (temp_id % period_for_power == 0))
+        if ((temp_id >= period_for_power) && (temp_id % period_for_power == 0))
             update_power_consumption(hardware_reconf::read_system_power());
 #endif
 #endif
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        if (uselopt && !lopt_.isConverged()){
+        if (uselopt && !lopt_.isConverged()) {
             last_power_usage++;
             current_power_usage = hardware_reconf::read_system_power();
             power_sum += current_power_usage;
@@ -775,7 +773,8 @@ void scheduler::optimize_locally(work_item const& work)
                 }
 
                 lopt_.measureObjective(current_avg_iter_time,power_sum/last_power_usage,
-                        active_threads);
+                // active_threads
+                        lopt_.getCurrentThreads());
                 last_power_usage=0;
                 power_sum=0;
             }
@@ -790,39 +789,27 @@ void scheduler::optimize_locally(work_item const& work)
                 lopt_.printverbosesteps(act_temp);
 #endif
                 // amend threads if signaled
-                /*
-                if (act_temp.delta_threads<0){
-                    unsigned int suspended_temp =
-                        suspend_threads(-1 * act_temp.delta_threads);
-                    lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp);
-                }
-                else if (act_temp.delta_threads>0){
-                    unsigned int resumed_temp =
-                        resume_threads(act_temp.delta_threads);
-                    lopt_.setCurrentThreads(lopt_.getCurrentThreads()+resumed_temp);
-                }
-                */
-
+                
                 if (act_temp.delta_threads < active_threads){
 #ifdef DEBUG_MULTIOBJECTIVE_
-                    int new_threads_target = (int)active_threads - act_temp.delta_threads;
-                    std::cout << "[SCHEDULER|INFO]: Optimizer induced threads to suspend: " << new_threads_target << std::endl;
-                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << ", target threads = " << act_temp.delta_threads << std::endl;
-#endif
+                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
+                    << " , target threads = " << act_temp.delta_threads << std::endl;
+
+#endif    
                     //unsigned int suspended_temp = suspend_threads(new_threads_target);
                     //lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp);
-
-                    lopt_.setCurrentThreads(active_threads);
+                    suspend_threads(active_threads-act_temp.delta_threads);
                 }
                 else if (act_temp.delta_threads > active_threads){
 #ifdef DEBUG_MULTIOBJECTIVE_
-                    int new_threads_target = act_temp.delta_threads - (int)active_threads;
-                    std::cout << "[SCHEDULER|INFO]: Optimizer induced threads to resume to: " << new_threads_target << std::endl;
-                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << ", target threads = " << act_temp.delta_threads << std::endl;
+                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
+                    << " , target threads = " << act_temp.delta_threads << std::endl;
 #endif
-                    fix_allcores_frequencies(act_temp.frequency_idx);
-                    lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx);
+                    resume_threads(act_temp.delta_threads - active_threads);
                 }
+                fix_allcores_frequencies(act_temp.frequency_idx);
+                lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx);
+                lopt_.setCurrentThreads(act_temp.delta_threads);
             }
         } // uselopt
 #endif

From a673f4344ac6e856f47be32f534b0b1e31e0bde6 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 13 Nov 2018 08:58:18 +0000
Subject: [PATCH 04/37] Optimize NelderMead

---
 allscale/components/nmsimplex_bbincr.hpp |  12 +-
 src/components/localoptimizer.cpp        | 685 +++++++++++----------
 src/components/nmsimplex_bbincr.cpp      | 744 ++++++++++++-----------
 src/components/scheduler_component.cpp   |  21 +-
 src/components/util/hardware_reconf.cpp  |   2 +
 5 files changed, 798 insertions(+), 666 deletions(-)

diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index f894d2b..ea4f3bd 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -30,6 +30,7 @@ namespace allscale { namespace components {
 #define ALPHA       1.0       /* reflection coefficient */
 #define BETA        0.5       /* contraction coefficient */
 #define GAMMA       2.0       /* expansion coefficient */
+#define DELTA       0.5        /* shrinking coefficient */
 
 /* structure type of a single optimization step return status */
 struct optstepresult{
@@ -42,7 +43,8 @@ struct optstepresult{
 };
 
 /* enumeration encoding state that the incremental Nelder Mead optimizer is at */
-enum iterationstates {start, reflection, expansion, contraction};
+enum iterationstates {start, reflection, expansion,
+                      contraction, shrink};
 
 class NelderMead {
 
@@ -63,9 +65,17 @@ class NelderMead {
     unsigned long int getIterations(){return itr;}
 
   private:
+
+    optstepresult do_step_start(double param);
+    optstepresult do_step_reflect(double param);
+    optstepresult do_step_expand(double param);
+    optstepresult do_step_contract(double param);
+    optstepresult do_step_shrink(double param);
+
     int vg_index();
     int vs_index();
     int vh_index();
+    void sort_vertices(void);
     void my_constraints(double*);
     void centroid();
     bool testConvergence();
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 0d5d59b..593853b 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -13,117 +13,130 @@
 #define DEBUG_ 1
 //#define DEBUG_INIT_ 1 // define to generate output during scheduler initialization
 #define DEBUG_MULTIOBJECTIVE_ 1
-//#define DEBUG_CONVERGENCE_ 1
+#define DEBUG_CONVERGENCE_ 1
 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test
 #define MEASURE_ 1
 // only meant to be defined if one needs to measure the efficacy
 // of the scheduler
 //#define ALLSCALE_HAVE_CPUFREQ 1
 
-namespace allscale {
-namespace components {
+namespace allscale
+{
+namespace components
+{
 
 localoptimizer::localoptimizer(std::list<objective> targetobjectives)
-  : objectives_((int)targetobjectives.size()),
-    nmd(0.01),
-    param_changes_(0),
-    steps_(0),
-    current_param_(thread),
-    converged_(false)
+    : objectives_((int)targetobjectives.size()),
+      nmd(convergence_threshold_),
+      param_changes_(0),
+      steps_(0),
+      current_param_(thread),
+      converged_(false)
+{
+  for (objective o : targetobjectives)
   {
-    for (objective o : targetobjectives) {
-      //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-      objectives_[o.priority] = o;
-      objectives_[o.priority].localmin=10000;
-      objectives_[o.priority].globalmin=10000;
-      objectives_[o.priority].localmax=0.0;
-      objectives_[o.priority].globalmax=0.0;
-      objectives_[o.priority].converged=false;
-      objectives_[o.priority].initialized=false;
-      objectives_[o.priority].min_params_idx=0;
-      objectives_[o.priority].converged_minimum=0;
-    }
+    //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
+    objectives_[o.priority] = o;
+    objectives_[o.priority].localmin = 10000;
+    objectives_[o.priority].globalmin = 10000;
+    objectives_[o.priority].localmax = 0.0;
+    objectives_[o.priority].globalmax = 0.0;
+    objectives_[o.priority].converged = false;
+    objectives_[o.priority].initialized = false;
+    objectives_[o.priority].min_params_idx = 0;
+    objectives_[o.priority].converged_minimum = 0;
+  }
 #ifdef ALLSCALE_HAVE_CPUFREQ
-    setCurrentFrequencyIdx(0);
+  setCurrentFrequencyIdx(0);
 #endif
 };
 
-void localoptimizer::setobjectives(std::list<objective> targetobjectives){
+void localoptimizer::setobjectives(std::list<objective> targetobjectives)
+{
   objectives_.clear();
   objectives_.resize((int)targetobjectives.size());
-  for (objective o : targetobjectives) {
+  for (objective o : targetobjectives)
+  {
     //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
     objectives_[o.priority] = o;
-    objectives_[o.priority].localmin=10000;
-    objectives_[o.priority].globalmin=10000;
-    objectives_[o.priority].localmax=0.0;
-    objectives_[o.priority].globalmax=0.0;
-    objectives_[o.priority].converged=false;
-    objectives_[o.priority].initialized=false;
-    objectives_[o.priority].min_params_idx=0;
-    objectives_[o.priority].converged_minimum=0;
+    objectives_[o.priority].localmin = 10000;
+    objectives_[o.priority].globalmin = 10000;
+    objectives_[o.priority].localmax = 0.0;
+    objectives_[o.priority].globalmax = 0.0;
+    objectives_[o.priority].converged = false;
+    objectives_[o.priority].initialized = false;
+    objectives_[o.priority].min_params_idx = 0;
+    objectives_[o.priority].converged_minimum = 0;
   }
-  steps_=0;
-  param_changes_=0;
-  current_param_=thread;
+  steps_ = 0;
+  param_changes_ = 0;
+  current_param_ = thread;
 #ifdef ALLSCALE_HAVE_CPUFREQ
   setCurrentFrequencyIdx(0);
 #endif
-  converged_=false;
+  converged_ = false;
 }
 
-void localoptimizer::reset(int threads, int freq_idx){
+void localoptimizer::reset(int threads, int freq_idx)
+{
   threads_param_ = threads;
-  param_changes_=0;
+  param_changes_ = 0;
   thread_param_values_.clear();
 #ifdef ALLSCALE_HAVE_CPUFREQ
-  frequency_param_= freq_idx;
+  frequency_param_ = freq_idx;
   frequency_param_values_.clear();
 #endif
-  current_objective_idx_=0;
-  steps_=0;
-  current_param_=thread;
-  converged_=false;
+  current_objective_idx_ = 0;
+  steps_ = 0;
+  current_param_ = thread;
+  converged_ = false;
 };
 
 #ifdef DEBUG_
-void localoptimizer::printobjectives(){
-  for(auto& el: objectives_){
-    std::cout << "Objective" << "\t\t" << "Priority" << "\t\t" << "Leeway" <<
-    std::endl;
-    switch (el.type){
-      case time:
-        std::cout << "Time" << "\t\t" << el.priority << "\t\t" << el.leeway <<
-        std::endl;
-        break;
-      case energy:
-        std::cout << "Energy" << "\t\t" << el.priority << "\t\t" << el.leeway <<
-        std::endl;
-        break;
-      case resource:
-        std::cout << "Resource" << "\t\t" << el.priority << "\t\t" << el.leeway <<
-        std::endl;
-        break;
+void localoptimizer::printobjectives()
+{
+  for (auto &el : objectives_)
+  {
+    std::cout << "Objective"
+              << "\t\t"
+              << "Priority"
+              << "\t\t"
+              << "Leeway" << std::endl;
+    switch (el.type)
+    {
+    case time:
+      std::cout << "Time"
+                << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
+      break;
+    case energy:
+      std::cout << "Energy"
+                << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
+      break;
+    case resource:
+      std::cout << "Resource"
+                << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
+      break;
     }
   }
 }
 
-void localoptimizer::printverbosesteps(actuation act){
+void localoptimizer::printverbosesteps(actuation act)
+{
   static int last_frequency_idx = 0;
 
-
   std::cout << "[INFO]";
-  if (optmethod_==random)
+  if (optmethod_ == random)
     std::cout << "Random ";
-  else if (optmethod_==allscale){
+  else if (optmethod_ == allscale)
+  {
     std::cout << "Allscale ";
   }
   std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_;
-  #ifdef ALLSCALE_HAVE_CPUFREQ
-  if ( act.frequency_idx >= 0 )
+#ifdef ALLSCALE_HAVE_CPUFREQ
+  if (act.frequency_idx >= 0)
     last_frequency_idx = act.frequency_idx;
   std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx]
-    << std::endl;
+            << std::endl;
 #else
   std::cout << std::endl;
 #endif
@@ -131,335 +144,367 @@ void localoptimizer::printverbosesteps(actuation act){
 
 #endif
 
-void localoptimizer::measureObjective(double iter_time, double power, double threads){
-  std::cout <<"Measuring objective: " 
-            << iter_time << " " 
-            << power << " " 
+void localoptimizer::measureObjective(double iter_time, double power, double threads)
+{
+  std::cout << "Measuring objective: "
+            << iter_time << " "
+            << power << " "
             << threads << std::endl;
-  
-  for(auto& el: objectives_){
-    switch (el.type){
-      case time:
-        el.samples.insert(el.samples.begin(),iter_time);
-        if (el.samples.size()>1000)
-          el.samples.resize(500);
-
-        el.threads_samples.insert(el.threads_samples.begin(),threads);
-        if (el.threads_samples.size()>1000)
-          el.threads_samples.resize(500);
+
+  for (auto &el : objectives_)
+  {
+    switch (el.type)
+    {
+    case time:
+      el.samples.insert(el.samples.begin(), iter_time);
+      if (el.samples.size() > 1000)
+        el.samples.resize(500);
+
+      el.threads_samples.insert(el.threads_samples.begin(), threads);
+      if (el.threads_samples.size() > 1000)
+        el.threads_samples.resize(500);
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx());
-        if (el.freq_samples.size()>1000)
-          el.freq_samples.resize(500);
+      el.freq_samples.insert(el.freq_samples.begin(), getCurrentFrequencyIdx());
+      if (el.freq_samples.size() > 1000)
+        el.freq_samples.resize(500);
 #endif
 
-        if (el.globalmin > iter_time){
-          el.globalmin = iter_time;
-          el.min_params_idx=param_changes_;
-        }
-        if (el.globalmax < iter_time)
-          el.globalmax = iter_time;
+      if (el.globalmin > iter_time)
+      {
+        el.globalmin = iter_time;
+        el.min_params_idx = param_changes_;
+      }
+      if (el.globalmax < iter_time)
+        el.globalmax = iter_time;
 #ifdef DEBUG__
-        std::cout << "Iteration Time Minimum: " << el.globalmin << std::endl;
-        std::cout << "Iteration Time Maximum: " << el.globalmax << std::endl;
-        std::cout << "Iteration Time Samples: ";
-        for(auto& samp: el.samples)
-          std::cout << samp << ",";
-        std::cout << std::endl;
-#endif
-        break;
-      case energy:
-        el.samples.insert(el.samples.begin(),power);
-        if (el.samples.size()>1000)
-          el.samples.resize(500);
-
-        el.threads_samples.insert(el.threads_samples.begin(),threads);
-        if (el.threads_samples.size()>1000)
-          el.threads_samples.resize(500);
+      std::cout << "Iteration Time Minimum: " << el.globalmin << std::endl;
+      std::cout << "Iteration Time Maximum: " << el.globalmax << std::endl;
+      std::cout << "Iteration Time Samples: ";
+      for (auto &samp : el.samples)
+        std::cout << samp << ",";
+      std::cout << std::endl;
+#endif
+      break;
+    case energy:
+      el.samples.insert(el.samples.begin(), power);
+      if (el.samples.size() > 1000)
+        el.samples.resize(500);
+
+      el.threads_samples.insert(el.threads_samples.begin(), threads);
+      if (el.threads_samples.size() > 1000)
+        el.threads_samples.resize(500);
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx());
-        if (el.freq_samples.size()>1000)
-          el.freq_samples.resize(500);
+      el.freq_samples.insert(el.freq_samples.begin(), getCurrentFrequencyIdx());
+      if (el.freq_samples.size() > 1000)
+        el.freq_samples.resize(500);
 #endif
 
-        if (el.globalmin > power){
-          el.globalmin = power;
-          el.min_params_idx=param_changes_;
-        }
-        if (el.globalmax < power)
-          el.globalmax = power;
+      if (el.globalmin > power)
+      {
+        el.globalmin = power;
+        el.min_params_idx = param_changes_;
+      }
+      if (el.globalmax < power)
+        el.globalmax = power;
 #ifdef DEBUG__
-        std::cout << "Power Consumption Minimum: " << el.globalmin << std::endl;
-        std::cout << "Power Consumption Maximum: " << el.globalmax << std::endl;
-        std::cout << "Power Consumption Samples: ";
-        for(auto& samp: el.samples)
-          std::cout << samp << ",";
-        std::cout << std::endl;
-#endif
-        break;
-      case resource:
-        el.samples.insert(el.samples.begin(),threads);
-        if (el.samples.size()>1000)
-          el.samples.resize(500);
-
-        el.threads_samples.insert(el.threads_samples.begin(),threads);
-        if (el.threads_samples.size()>1000)
-          el.threads_samples.resize(500);
+      std::cout << "Power Consumption Minimum: " << el.globalmin << std::endl;
+      std::cout << "Power Consumption Maximum: " << el.globalmax << std::endl;
+      std::cout << "Power Consumption Samples: ";
+      for (auto &samp : el.samples)
+        std::cout << samp << ",";
+      std::cout << std::endl;
+#endif
+      break;
+    case resource:
+      el.samples.insert(el.samples.begin(), threads);
+      if (el.samples.size() > 1000)
+        el.samples.resize(500);
+
+      el.threads_samples.insert(el.threads_samples.begin(), threads);
+      if (el.threads_samples.size() > 1000)
+        el.threads_samples.resize(500);
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx());
-        if (el.freq_samples.size()>1000)
-          el.freq_samples.resize(500);
+      el.freq_samples.insert(el.freq_samples.begin(), getCurrentFrequencyIdx());
+      if (el.freq_samples.size() > 1000)
+        el.freq_samples.resize(500);
 #endif
 
-        if (el.globalmin > threads){
-          el.globalmin = threads;
-          el.min_params_idx=param_changes_;
-        }
-        if (el.globalmax < threads)
-          el.globalmax = threads;
+      if (el.globalmin > threads)
+      {
+        el.globalmin = threads;
+        el.min_params_idx = param_changes_;
+      }
+      if (el.globalmax < threads)
+        el.globalmax = threads;
 #ifdef DEBUG__
-        std::cout << "Threads Minimum: " << el.globalmin << std::endl;
-        std::cout << "Threads Maximum: " << el.globalmax << std::endl;
-        std::cout << "Threads Samples: ";
-        for(auto& samp: el.samples)
-          std::cout << samp << ",";
-        std::cout << std::endl;
-#endif
-        break;
+      std::cout << "Threads Minimum: " << el.globalmin << std::endl;
+      std::cout << "Threads Maximum: " << el.globalmax << std::endl;
+      std::cout << "Threads Samples: ";
+      for (auto &samp : el.samples)
+        std::cout << samp << ",";
+      std::cout << std::endl;
+#endif
+      break;
     }
   }
 }
 
 actuation localoptimizer::step()
 {
-    steps_++;
-    actuation act;
-    act.delta_threads=threads_param_;
+  steps_++;
+  actuation act;
+  act.delta_threads = threads_param_;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-    act.frequency_idx=frequency_param_;
+  act.frequency_idx = frequency_param_;
 #endif
-    /* random optimization step */
-    if (optmethod_ == random)
-    {
-        act.delta_threads = (rand() % max_threads_);
+  /* random optimization step */
+  if (optmethod_ == random)
+  {
+    act.delta_threads = (rand() % max_threads_);
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        act.frequency_idx = rand() % frequencies_param_allowed_.size();
-        // if (act.frequency_idx == frequency_param_)
-        //     act.frequency_idx = -1;
+    act.frequency_idx = rand() % frequencies_param_allowed_.size();
+    // if (act.frequency_idx == frequency_param_)
+    //     act.frequency_idx = -1;
 #endif
-    }
+  }
 
-    else if (optmethod_ == allscale)
-    {
-        if (current_objective_idx_ > objectives_.size())
-  	      goto validate_act;
+  else if (optmethod_ == allscale)
+  {
+    if (current_objective_idx_ > objectives_.size())
+      goto validate_act;
 
-        if (steps_ < warmup_steps_)
-        {
+    if (steps_ < warmup_steps_)
+    {
 
 #ifdef DEBUG_MULTIOBJECTIVE_
-            std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n";
+      std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n";
 #endif
-            // set some random parametrization to collect at least 3 different
-            // vertices to be used as input to the optimizer
-    	    act.delta_threads = rand() % max_threads_;
+      // set some random parametrization to collect at least 3 different
+      // vertices to be used as input to the optimizer
+
+#if 1
+      float bucket_dt = steps_ / (float)warmup_steps_;
+      float _min_threads = max_threads_ * bucket_dt;
+
+      act.delta_threads = rand() % (int)ceil(bucket_dt) + roundf(_min_threads);
 #ifdef ALLSCALE_HAVE_CPUFREQ
-    	    act.frequency_idx = rand() % frequencies_param_allowed_.size();
+      float _min_freqs = frequencies_param_allowed_.size() * bucket_dt;
+      act.frequency_idx = rand() % (int)ceil(bucket_dt) + roundf(_min_freqs);
 #endif
-        goto validate_act;
-        }
+#endif
+      goto validate_act;
+    }
 
-        // iterate over all objectives in decreasing priority
-        objective obj = objectives_[current_objective_idx_];
+    // iterate over all objectives in decreasing priority
+    objective obj = objectives_[current_objective_idx_];
 
-        // initialize optimizer for this objective, if not already done so
-        if (!obj.initialized)
-        {
+    // initialize optimizer for this objective, if not already done so
+    if (!obj.initialized)
+    {
 #ifdef DEBUG_MULTIOBJECTIVE_
-            std::cout << "[LOCALOPTIMIZER|INFO] Initializing optimizer for new objective\n";
-	        std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
-	        for (auto& sam: obj.samples)
-            {
-	            std::cout << sam << "," << std::flush;
-	        }
-            std::cout << "\n" << std::flush;
-
-            std::cout << "[LOCALOPTIMIZER|DEBUG] Thread Param of Samples: " << std::flush;
-            for (auto& sam: obj.threads_samples)
-            {
-                std::cout << sam << "," << std::flush;
-            }
-            std::cout << "\n" << std::flush;
+      std::cout << "[LOCALOPTIMIZER|INFO] Initializing optimizer for new objective\n";
+      std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
+      for (auto &sam : obj.samples)
+      {
+        std::cout << sam << "," << std::flush;
+      }
+      std::cout << "\n"
+                << std::flush;
+
+      std::cout << "[LOCALOPTIMIZER|DEBUG] Thread Param of Samples: " << std::flush;
+      for (auto &sam : obj.threads_samples)
+      {
+        std::cout << sam << "," << std::flush;
+      }
+      std::cout << "\n"
+                << std::flush;
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-            std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
-            for (auto& sam: obj.freq_samples){
-                std::cout << sam << "," << std::flush;
-            }
-            std::cout << "\n" << std::flush;
+      std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
+      for (auto &sam : obj.freq_samples)
+      {
+        std::cout << sam << "," << std::flush;
+      }
+      std::cout << "\n"
+                << std::flush;
 #endif
 #endif
-            int samplenr = obj.samples.size();
+      int samplenr = obj.samples.size();
 #ifdef ALLSCALE_HAVE_CPUFREQ
-            double params[3][2]={
-                {obj.threads_samples[samplenr-1],obj.freq_samples[samplenr-1]},
-                {obj.threads_samples[samplenr-2],obj.freq_samples[samplenr-2]},
-                {obj.threads_samples[samplenr-3],obj.freq_samples[samplenr-3]},
-            };
-            double values[3]={obj.samples[samplenr-1],obj.samples[samplenr-2],obj.samples[samplenr-3]};
-
-            double constraint_min[]={1,0};
-            double constraint_max[]={(double)max_threads_,
-                (double)frequencies_param_allowed_.size()};
-            std::cout << "initialize_simplex::Initializing with " << frequencies_param_allowed_.size() << " frequencies" << std::endl;
-            nmd.initialize_simplex(params,values,constraint_min,constraint_max);
-            objectives_[current_objective_idx_].initialized=true;
+      double params[3][2] = {
+          {obj.threads_samples[samplenr - 1], obj.freq_samples[samplenr - 1]},
+          {obj.threads_samples[samplenr - 2], obj.freq_samples[samplenr - 2]},
+          {obj.threads_samples[samplenr - 3], obj.freq_samples[samplenr - 3]},
+      };
+      double values[3] = {obj.samples[samplenr - 1], obj.samples[samplenr - 2], obj.samples[samplenr - 3]};
+      double min_threads = round(max_threads_ * 0.25);
+
+      if (min_threads < 1.0)
+        min_threads = 1.0;
+
+      double constraint_min[] = {min_threads, 0};
+      double constraint_max[] = {(double)max_threads_,
+                                 (double)frequencies_param_allowed_.size() - 1};
+      std::cout << "initialize_simplex::Initializing with " << frequencies_param_allowed_.size() << " frequencies" << std::endl;
+      nmd.initialize_simplex(params, values, constraint_min, constraint_max);
+      objectives_[current_objective_idx_].initialized = true;
 #endif
-        }
+    }
 
 #ifdef DEBUG_MULTIOBJECTIVE_
-        std::cout << "[LOCALOPTIMIZER|DEBG] Current Optimized Objective =";
-        switch (obj.type)
-        {
-            case energy:
-                std::cout << "********** Energy\n" << std::flush;
-                break;
-            case time:
-                std::cout << "&&&&&&&&&& Time\n" << std::flush;
-                break;
-            case resource:
-                std::cout << "oooooooooo Resource\n" << std::flush;
-                break;
-        }
-        std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
-        for (auto& sam: obj.samples)
-        {
-            std::cout << sam << "," << std::flush;
-        }
-        std::cout << "\n" << std::flush;
+    std::cout << "[LOCALOPTIMIZER|DEBG] Current Optimized Objective =";
+    switch (obj.type)
+    {
+    case energy:
+      std::cout << "********** Energy\n"
+                << std::flush;
+      break;
+    case time:
+      std::cout << "&&&&&&&&&& Time\n"
+                << std::flush;
+      break;
+    case resource:
+      std::cout << "oooooooooo Resource\n"
+                << std::flush;
+      break;
+    }
+    std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
+    for (auto &sam : obj.samples)
+    {
+      std::cout << sam << "," << std::flush;
+    }
+    std::cout << "\n"
+              << std::flush;
 
-        std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
+    std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        for (auto& sam: obj.freq_samples)
-        {
-            std::cout << sam << "," << std::flush;
-        }
-        std::cout << "\n" << std::flush;
+    for (auto &sam : obj.freq_samples)
+    {
+      std::cout << sam << "," << std::flush;
+    }
+    std::cout << "\n"
+              << std::flush;
 #endif
 #endif
 
-        optstepresult nmd_res = nmd.step(obj.samples[0]);
+    optstepresult nmd_res = nmd.step(obj.samples[0]);
 #ifdef DEBUG_MULTIOBJECTIVE_
-        std::cout << "[LOCALOPTIMIZER|DEBUG] Calling NMD Optimizer Step, Param = \n";
-        std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try: ";
-        std::cout << "Threads = " << nmd_res.threads;
+    std::cout << "[LOCALOPTIMIZER|DEBUG] Calling NMD Optimizer Step, Param = \n";
+    std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try: ";
+    std::cout << "Threads = " << nmd_res.threads;
+#ifdef ALLSCALE_HAVE_CPUFREQ
+    std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl;
+#endif
+    std::cout << "Converg Thresh = " << convergence_threshold_ << std::endl;
+#endif
+    if (nmd_res.converged)
+    {
+      objectives_[current_objective_idx_].converged = true;
+      objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
+      double *minimization_point = nmd.getMinVertices();
+      objectives_[current_objective_idx_].minimization_params[0] =
+          minimization_point[0];
+      objectives_[current_objective_idx_].minimization_params[1] =
+          minimization_point[1];
+#ifdef DEBUG_CONVERGENCE_
+      std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n";
+      std::cout << "******************************************" << std::endl;
+      std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << objectives_[current_objective_idx_].converged_minimum << "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] << std::endl;
+      std::cout << "******************************************" << std::endl;
+#endif
+      act.delta_threads = minimization_point[0];
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl;
+      act.frequency_idx = minimization_point[1];
 #endif
-        std::cout << "Converg Thresh = " << convergence_threshold_ << std::endl;
+      current_objective_idx_++;
+      if (current_objective_idx_ == objectives_.size())
+      {
+        converged_ = true;
+#ifdef DEBUG_CONVERGENCE_
+        std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
 #endif
-        if (nmd_res.converged)
+      }
+    }
+    else
+    {
+#if 0
+      // if a higher priority objective starts getting off leeway margin,
+      // decide convergence of the current param at this parameter point
+      if (current_objective_idx_ > 0)
+        for (int i = 0; i < current_objective_idx_; i++)
         {
+          objective priority_obj = objectives_[i];
+          double max_leeway_value = priority_obj.converged_minimum +
+                                    priority_obj.leeway * (priority_obj.globalmax - priority_obj.converged_minimum);
+          if (priority_obj.samples[0] > max_leeway_value &&
+              priority_obj.samples[1] > max_leeway_value)
+          {
             objectives_[current_objective_idx_].converged = true;
             objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
-            double* minimization_point = nmd.getMinVertices();
-            objectives_[current_objective_idx_].minimization_params[0]=
+            double *minimization_point = nmd.getMinVertices();
+            objectives_[current_objective_idx_].minimization_params[0] =
                 minimization_point[0];
-            objectives_[current_objective_idx_].minimization_params[1]=
+            objectives_[current_objective_idx_].minimization_params[1] =
                 minimization_point[1];
+
 #ifdef DEBUG_CONVERGENCE_
-            std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n";
+            std::cout << "[LOCALOPTIMIZER|INFO] Leeway convergence\n";
             std::cout << "******************************************" << std::endl;
-            std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " <<
-                objectives_[current_objective_idx_].converged_minimum <<
-                "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] <<
-                std::endl;
+            std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << objectives_[current_objective_idx_].converged_minimum << "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] << std::endl;
             std::cout << "******************************************" << std::endl;
 #endif
-            act.delta_threads=minimization_point[0];
+            // find the parameter point that scores the leeway margin value
+            act.delta_threads = (int)priority_obj.minimization_params[0] *
+                                (max_leeway_value / priority_obj.converged_minimum);
 #ifdef ALLSCALE_HAVE_CPUFREQ
-            act.frequency_idx=minimization_point[1];
+            act.frequency_idx = (int)priority_obj.minimization_params[1] *
+                                (max_leeway_value / priority_obj.converged_minimum);
 #endif
             current_objective_idx_++;
             if (current_objective_idx_ == objectives_.size())
             {
-                converged_=true;
+              converged_ = true;
 #ifdef DEBUG_CONVERGENCE_
-                std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
+              std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
 #endif
             }
-        }
-        else
-        {
-            // if a higher priority objective starts getting off leeway margin,
-            // decide convergence of the current param at this parameter point
-            if (current_objective_idx_>0)
-                for (int i=0;i<current_objective_idx_;i++)
-                {
-                    objective priority_obj=objectives_[i];
-                    double max_leeway_value = priority_obj.converged_minimum +
-                        priority_obj.leeway*(priority_obj.globalmax - priority_obj.converged_minimum);
-                    if (priority_obj.samples[0] > max_leeway_value &&
-                            priority_obj.samples[1] > max_leeway_value)
-                    {
-                        objectives_[current_objective_idx_].converged = true;
-                        objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
-                        double* minimization_point = nmd.getMinVertices();
-                        objectives_[current_objective_idx_].minimization_params[0]=
-                            minimization_point[0];
-                        objectives_[current_objective_idx_].minimization_params[1]=
-                            minimization_point[1];
-
-#ifdef DEBUG_CONVERGENCE_
-                        std::cout << "[LOCALOPTIMIZER|INFO] Leeway convergence\n";
-                        std::cout << "******************************************" << std::endl;
-                        std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " <<
-                            objectives_[current_objective_idx_].converged_minimum <<
-                            "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] <<
-                            std::endl;
-                        std::cout << "******************************************" << std::endl;
-#endif
-                        // find the parameter point that scores the leeway margin value
-						act.delta_threads = (int)priority_obj.minimization_params[0]*
-                            (max_leeway_value/priority_obj.converged_minimum);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-                        act.frequency_idx = (int)priority_obj.minimization_params[1]*
-                            (max_leeway_value/priority_obj.converged_minimum);
-#endif
-			            current_objective_idx_++;
-			            if (current_objective_idx_ == objectives_.size())
-                        {
-                            converged_=true;
-#ifdef DEBUG_CONVERGENCE_
-                            std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
-#endif
-                        }
-                        act.delta_threads=(nmd_res.threads==0)?getCurrentThreads():nmd_res.threads;
+            act.delta_threads = (nmd_res.threads == 0) ? getCurrentThreads() : nmd_res.threads;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-                        act.frequency_idx=nmd_res.freq_idx;
+            act.frequency_idx = nmd_res.freq_idx;
 #endif
 
-                        goto validate_act;
-                    }
-    		}
+            goto validate_act;
+          }
         }
+#else
+  act.delta_threads = nmd_res.threads;
+  act.frequency_idx = nmd_res.freq_idx;
+#endif
     }
-    validate_act:
+  }
+validate_act:
 
-    if ( act.delta_threads > max_threads_) {
-      act.delta_threads = max_threads_;
-    } else if ( act.delta_threads < 1 ) {
-      act.delta_threads = getCurrentThreads();
-    }
+  if (act.delta_threads > max_threads_)
+  {
+    act.delta_threads = max_threads_;
+  }
+  else if (act.delta_threads < 1)
+  {
+    act.delta_threads = getCurrentThreads();
+  }
 #ifdef ALLSCALE_HAVE_CPUFREQ
-    // VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
-    if ( act.frequency_idx < 0)
-      act.frequency_idx= frequency_param_;
+  // VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
+  if (act.frequency_idx < 0)
+    act.frequency_idx = frequency_param_;
+  else if (act.frequency_idx > frequencies_param_allowed_.size() - 1)
+  {
+    act.frequency_idx = frequencies_param_allowed_.size() - 1;
+  }
 #endif
-    return act;
-}
-}
+  return act;
 }
+} // namespace components
+} // namespace allscale
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index a0c964e..b96664a 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -18,451 +18,507 @@
 
    vector<doubl
 
- */
+*/
 
-namespace allscale { namespace components {
+namespace allscale
+{
+namespace components
+{
 
 //NelderMead::NelderMead(double (*objfunc)(double[]),double eps){
-NelderMead::NelderMead(double eps){
+NelderMead::NelderMead(double eps)
+{
 
-  EPSILON=eps;
+    EPSILON = eps;
 #ifdef NMD_INFO_
-  std::cout << "[NelderMead|INFO] Initial Convergence Threshold set is " << EPSILON << std::endl;
+    std::cout << "[NelderMead|INFO] Initial Convergence Threshold set is " << EPSILON << std::endl;
 #endif
-  itr=0;
-  state_ = start;
-
-  /* dynamically allocate arrays */
-
-  /* allocate the rows of the arrays */
-  v =  (double **) malloc ((n+1) * sizeof(double *));
-  f =  (double *) malloc ((n+1) * sizeof(double));
-  vr = (double *) malloc (n * sizeof(double));
-  ve = (double *) malloc (n * sizeof(double));
-  vc = (double *) malloc (n * sizeof(double));
-  vm = (double *) malloc (n * sizeof(double));
-
-  /* allocate the columns of the arrays */
-  for (i=0;i<=n;i++) {
-    v[i] = (double *) malloc (n * sizeof(double));
-  }
+    itr = 0;
+    state_ = start;
+
+    /* dynamically allocate arrays */
+
+    /* allocate the rows of the arrays */
+    v = (double **)malloc((n + 1) * sizeof(double *));
+    f = (double *)malloc((n + 1) * sizeof(double));
+    vr = (double *)malloc(n * sizeof(double));
+    ve = (double *)malloc(n * sizeof(double));
+    vc = (double *)malloc(n * sizeof(double));
+    vm = (double *)malloc(n * sizeof(double));
+
+    /* allocate the columns of the arrays */
+    for (i = 0; i <= n; i++)
+    {
+        v[i] = (double *)malloc(n * sizeof(double));
+    }
 }
 
 void NelderMead::my_constraints(double x[])
 {
-  // round to integer and bring again with allowable margins
-  // todo fix: generalize
-  if (x[0] < constraint_min[0] || x[0] > constraint_max[0]){
-    x[0] = (constraint_min[0] + constraint_max[0])/2;
-  }
-
-  if (x[1] < constraint_min[1] || x[1] > constraint_max[1]){
-    x[1] = (constraint_min[1] + constraint_max[1])/2;
-  }
-
-  x[0]=round(x[0]);
-  x[1]=round(x[1]);
+    // round to integer and bring again with allowable margins
+    // todo fix: generalize
+
+    // if (x[0] < constraint_min[0] || x[0] > constraint_max[0]){
+    //   x[0] = (constraint_min[0] + constraint_max[0])/2;
+    // }
+
+    // if (x[1] < constraint_min[1] || x[1] > constraint_max[1]){
+    //   x[1] = (constraint_min[1] + constraint_max[1])/2;
+    // }
+
+    for (auto i = 0u; i < 2u; ++i)
+    {
+        if (x[i] < constraint_min[i])
+            x[i] = constraint_min[i];
+        else if (x[i] > constraint_max[i])
+            x[i] = constraint_max[i];
+    }
+
+    x[0] = round(x[0]);
+    x[1] = round(x[1]);
 }
 
 /* FIXME: generalize */
-void NelderMead::initialize_simplex(double params[][2], double values[], double constraint_min[],double constraint_max[])
+void NelderMead::initialize_simplex(double params[][2], double values[], double constraint_min[], double constraint_max[])
 {
-  int i,j;
+    int i, j;
 
-  for (i=0;i<=n;i++) {
-    for (j=0;j<n;j++) {
-  	  v[i][j] = params[i][j];
+    for (i = 0; i <= n; i++)
+    {
+        for (j = 0; j < n; j++)
+        {
+            v[i][j] = params[i][j];
+        }
+        f[i] = values[i];
+        this->constraint_min[i] = constraint_min[i];
+        this->constraint_max[i] = constraint_max[i];
     }
-    f[i]=values[i];
-    this->constraint_min[i]=constraint_min[i];
-    this->constraint_max[i]=constraint_max[i];
-  }
-  itr=0;
-}
+    itr = 0;
 
+    state_ = start;
+}
 
 /* print out the initial values */
 void NelderMead::print_initial_simplex()
 {
-  int i,j;
-  std::cout << "[NelderMead DEBUG] Initial Values\n";
-  for (j=0;j<=n;j++) {
-    for (i=0;i<n;i++) {
-      std::cout << v[j][i] << ",";
+    int i, j;
+    std::cout << "[NelderMead DEBUG] Initial Values\n";
+    for (j = 0; j <= n; j++)
+    {
+        for (i = 0; i < n; i++)
+        {
+            std::cout << v[j][i] << ",";
+        }
+        std::cout << " Objective value = " << f[j] << std::endl;
     }
-    std::cout << "Objective value = " << f[j] << std::endl;
-  }
 }
 
-
 /* print out the value at each iteration */
 void NelderMead::print_iteration()
 {
-  int i,j;
-  std::cout << "[NelderMead DEBUG] Iteration " << itr << std::endl;
-  //printf("Iteration %d\n",itr);
-  for (j=0;j<=n;j++) {
-    std::cout << "[NelderMead DEBUG] Vertex-" << j+1 << "=(";
-    for (i=0;i<n;i++) {
-      //printf("%f %f\n\n",v[j][i],f[j]);
-      std::cout << v[j][i];
-      if (i<n-1)
-        std::cout << "," ;
+    int i, j;
+    std::cout << "[NelderMead DEBUG] Iteration " << itr << std::endl;
+    //printf("Iteration %d\n",itr);
+    for (j = 0; j <= n; j++)
+    {
+        std::cout << "[NelderMead DEBUG] Vertex-" << j + 1 << "=(";
+        for (i = 0; i < n; i++)
+        {
+            //printf("%f %f\n\n",v[j][i],f[j]);
+            std::cout << v[j][i];
+            if (i < n - 1)
+                std::cout << ",";
+        }
+        std::cout << ")=" << f[j] << std::endl;
     }
-    std::cout << ")=" << f[j] << std::endl;
-  }
-  std::cout << "[NelderMead DEBUG] Current Objective Minimum is at: " << f[vs] << std::endl;
-  std::cout << "[NelderMead DEBUG] f[vs]= " << f[vs] << ", vs = " << vs << std::endl;
-  std::cout << "[NelderMead DEBUG] f[vh]= " << f[vh] << ", vh = " << vh << std::endl;
-  std::cout << "[NelderMead DEBUG] f[vg]= " << f[vg] << ", vg = " << vg << std::endl;
-}
 
+    std::cout << "[NelderMead DEBUG] Current Objective Minimum is at: " << f[vs] << std::endl;
+    std::cout << "[NelderMead DEBUG] f[vs]= " << f[vs] << ", vs = " << vs << std::endl;
+    std::cout << "[NelderMead DEBUG] f[vh]= " << f[vh] << ", vh = " << vh << std::endl;
+    std::cout << "[NelderMead DEBUG] f[vg]= " << f[vg] << ", vg = " << vg << std::endl;
+}
 
 /* find the index of the largest value */
 int NelderMead::vg_index()
 {
-  int j;
-  int vg=0;
-
-  for (j=0;j<=n;j++) {
-    if (f[j] > f[vg]) {
-      vg = j;
+    int j;
+    int vg = 0;
+
+    for (j = 0; j <= n; j++)
+    {
+        if (f[j] > f[vg])
+        {
+            vg = j;
+        }
     }
-  }
-  return vg;
+    return vg;
 }
 
-
 /* find the index of the smallest value */
 int NelderMead::vs_index()
 {
-  int j;
-  int vs=0;
-
-  for (j=0;j<=n;j++) {
-    if (f[j] < f[vs]) {
-      vs = j;
+    int j;
+    int vs = 0;
+
+    for (j = 0; j <= n; j++)
+    {
+        if (f[j] < f[vs])
+        {
+            vs = j;
+        }
     }
-  }
-  return vs;
+    return vs;
 }
 
-
 /* find the index of the second largest value */
 int NelderMead::vh_index()
 {
-  int j;
+    int j;
 
-  for (j=0;j<=n;j++) {
-    if (f[j] > f[vh] && f[j] < f[vg]) {
-      vh = j;
+    for (j = 0; j <= n; j++)
+    {
+        if (f[j] > f[vh] && f[j] < f[vg])
+        {
+            vh = j;
+        }
     }
-  }
-  return vh;
+    return vh;
 }
 
-
 /* calculate the centroid */
 void NelderMead::centroid()
 {
-  int j,m;
-  double cent;
-
-  for (j=0;j<=n-1;j++) {
-    cent=0.0;
-    for (m=0;m<=n;m++) {
-      if (m!=vg) {
-	      cent += v[m][j];
-      }
+    int j, m;
+    double cent;
+
+    for (j = 0; j <= n - 1; j++)
+    {
+        cent = 0.0;
+        for (m = 0; m <= n; m++)
+        {
+            if (m != vg)
+            {
+                cent += v[m][j];
+            }
+        }
+        vm[j] = cent / n;
     }
-    vm[j] = cent/n;
-  }
 }
 
-optstepresult NelderMead::step(double param)
+void NelderMead::sort_vertices()
 {
-  optstepresult res;
-  res.threads=0;
-  res.freq_idx=-1;
-  switch (state_){
+    // VV: -1 is used for padding because the index to this map will never evaluate to 0
+    int map_to_index[] = {
+        -1, 0, 1, 0, 2, 0, 0, 0};
 
-    /** ITERATION START **/
-    case start:
-      itr++;
+    vg = vs = vh = 0;
+
+    // VV: Compute greatest, smallest, and half-point
+    for (i = 0; i <= n; ++i)
+    {
+        vg = f[i] > f[vg] ? i : vg;
+        vs = f[i] < f[vs] ? i : vs;
+    }
+
+    // VV: Find the remaining (middle) vertex via a bitmap: 0b111 minus the vg and vs bits.
+    //     When vg==vs all points are equal; the value is then 5, which maps to index 0
+    vh = 1 + 2 + 4 - (1 << vg) - (1 << vs);
+    vh = map_to_index[vh];
+}
+
+optstepresult NelderMead::do_step_start(double param)
+{
+    optstepresult res;
+
+    itr++;
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
-      print_initial_simplex();
+    std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
+    print_initial_simplex();
 #endif
-      // todo: implement here the simplex initialization, currently this is
-      // done in the constructor
-
-      /* find the index of the largest value (W) */
-      vg = vg_index();
-
-      /* find the index of the smallest value (B) */
-      vs = vs_index();
-
-      /* find the index of the second largest value (G) */
-      vh = vh_index();
-
-      /* calculate the centroid */
-      centroid();
-
-      /* reflect vg to new vertex vr */
-      for (j=0;j<=n-1;j++) {
-        /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j];*/
-        /*
-        */
-        vr[j] = vm[j]+ALPHA*(vm[j]-v[vg][j]);
-
-        // std::cout << "vm[" << j << "]=" << vm[j] << std::endl;
-        // std::cout << "v[vg" << j << "]=" << v[vg][j] << std::endl;
-        // std::cout << "ALPHA=" << ALPHA << std::endl;
-        // std::cout << "Vr[" << j << "]=" << vr[j] << std::endl;
-      }
-      my_constraints(vr);
+    sort_vertices();
+
+    centroid();
+
+    for (j = 0; j <= n - 1; j++)
+    {
+        vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
+    }
+    my_constraints(vr);
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Reflection Parameter = ("
-                << vr[0] << "," << vr[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead DEBUG] Reflection Parameter = ("
+              << vr[0] << "," << vr[1] << ")"
+              << std::endl;
 #endif
-      // enter reflection state
-      state_=reflection;
-      res.threads=vr[0];
-      res.freq_idx=vr[1];
-
-      break;
+    // enter reflection state
+    state_ = reflection;
+    res.threads = vr[0];
+    res.freq_idx = vr[1];
 
-    /** REFLECTION **/
+    return res;
+}
 
-    /** This state is entered when we have received a sample of the objective
-     ** function at the reflection vertex
-     **/
-    case reflection:
+optstepresult NelderMead::do_step_reflect(double param)
+{
+    optstepresult res;
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
+    std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
 #endif
-      fr=param;
-      //fr = objfunc(vr);
-
-		  if (fr < f[vh]){ // f(R) < f(G) - Case (i)
-        if (fr >= f[vs]) { // f(R)>f(B)
-          for (j=0;j<=n-1;j++) { // replace W with R and end iteration
-	         v[vg][j] = vr[j];
-          }
-          f[vg] = fr;
-          updateObjectives();
-          state_=start;
-          break;
+    fr = param;
+
+    std::cout << "fr:" << fr << " f[vh]:" << f[vh]
+              << " f[vs]:" << f[vs] << std::endl;
+
+    if ( (f[vs] <= fr) && (fr < f[vh]) ) {
+        // VV: REFLECTED point is better than the SECOND BEST
+        //     but NOT better than the BEST
+        //     Replace WORST point with REFLECTED
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = vr[j];
         }
+        f[vg] = fr;
+        state_ = start;
+        return do_step_start(param);
+    } else if ( fr < f[vs] ) {
+        // VV: REFLECTED is better than BEST
+        
+        for ( j=0; j<=n-1; ++j)
+            ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
+        
+        my_constraints(ve);
+        // VV: Now evaluate EXPANDED
+        res.threads = ve[0];
+        res.freq_idx = ve[1];
+
+        state_ = expansion;
+
+        return res;
+    } else if ( (f[vh] <= fr) && (fr < f[vg])) {
+        // VV: REFLECTED between SECOND BEST and WORST
+        
+        for ( j=0; j<=n-1; ++j)
+            vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
+        
+        my_constraints(vc);
+
+        // VV: Now evaluate CONTRACTED (outside contraction)
+        res.threads = vc[0];
+        res.freq_idx = vc[1];
+
+        state_ = contraction;
+
+        return res;
+    } else {
+        // VV: REFLECTED worse than WORST
+        for ( j=0; j<=n-1; ++j)
+            vc[j] = vm[j] - BETA * (vr[j] - vm[j]);
+        
+        my_constraints(vc);
+
+        // VV: Now evaluate CONTRACTED (inside contraction)
+        res.threads = vc[0];
+        res.freq_idx = vc[1];
+
+        state_ = contraction;
+
+        return res;
+    }
+}
 
-        /* investigate a step further through expansion in this direction */
-        else{
-          for (j=0;j<=n-1;j++) {
-            /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j];*/
-            ve[j] = vm[j]+GAMMA*(vr[j]-vm[j]);
-          }
+optstepresult NelderMead::do_step_expand(double param)
+{
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Expansion Parameter = ("
-                << ve[0] << "," << ve[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl;
 #endif
-          my_constraints(ve);
-          // enter the state waiting for a sampled value of the objective function
-          // at the expansion vertex
-          state_=expansion;
-          res.threads=ve[0];
-          res.freq_idx=ve[1];
-
-          break;
+    fe = param;
+
+    if (fe < fr)
+    {
+        // VV: EXPANDED point is better than REFLECTIVE
+        //     Replace WORST with EXPANDED
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = ve[j];
         }
-
-      }else{ // f(R) > f(G) - Case (ii)
-        if (fr < f[vg]) { // f(R) < f(W)
-          for (j=0;j<=n-1;j++) {  // replace W with R
-           v[vg][j] = vr[j];
-          }
-          f[vg] = fr;
+        f[vg] = fe;
+    }
+    else
+    {
+        // VV: Replace WORST with REFLECTED
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = vr[j];
         }
+        f[vg] = fr;
+    }
 
-        if (fr < f[vg] && fr >= f[vh]) {
-	        /* perform outside contraction */
-	        for (j=0;j<=n-1;j++) {
-	          /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j];*/
-	          vc[j] = vm[j]+BETA*(vr[j]-vm[j]);
-	        }
+    state_ = start;
+    
+    return do_step_start(param);
+}
+
+optstepresult NelderMead::do_step_contract(double param)
+{
+    int j;
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Contraction Parameter = ("
-                << vc[0] << "," << vc[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead|DEBUG] State = Contraction" << std::endl;
 #endif
-          my_constraints(vc);
-          // enter the state waiting for a sampled value of the objective function
-          // at the outside contraction vertex
-          state_=contraction;
-          res.threads=vc[0];
-          res.freq_idx=vc[1];
-          break;
-        } else {
-	        /* perform inside contraction */
-	        for (j=0;j<=n-1;j++) {
-	          /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j];*/
-	          vc[j] = vm[j]-BETA*(vm[j]-v[vg][j]);
-	        }
+    fc = param;
+
+    if ( fc <= fr ) {
+        // VV: CONTRACTED_O is better than REFLECTED
+        //     Replace WORST with CONTRACTED_O
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = vc[j];
+        }
+        f[vg] = fc;
+
+        return do_step_start(param);
+    } else {
+        // VV: Shrink SECOND BEST towards BEST (WORST is left untouched here)
+        for (j = 0; j <= n - 1; j++)
+            v[vh][j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]);
+        
+        my_constraints(v[vh]);
+        // VV: Now evaluate SHRINK
+
+        optstepresult res;
+        res.threads = v[vh][0];
+        res.freq_idx = v[vh][1];
+        state_ = shrink;
+        return res;
+    }
+}
+
+optstepresult NelderMead::do_step_shrink(double param)
+{
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Contraction Parameter = ("
-                << vc[0] << "," << vc[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl;
 #endif
-	        my_constraints(vc);
-          state_=contraction;
-          res.threads=vc[0];
-          res.freq_idx=vc[1];
-          break;
-        }
+    f[vh] = param;
+    return do_step_start(param);
+}
+
+optstepresult NelderMead::step(double param)
+{
+    int i, j;
 
+    optstepresult res;
+    res.threads = 0;
+    res.freq_idx = -1;
 
-    /** EXPANSION **/
+    switch (state_)
+    {
 
-    /** This state is entered when we have received a sample of the objective
-     ** function at the expansion vertex
-     **/
+    case start:
+        res = do_step_start(param);
+    break;
+    case reflection:
+        res = do_step_reflect(param);
+    break;
     case expansion:
-#ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl;
-#endif
-      fe=param;
-      //fe = objfunc(ve);
-      if (fe < f[vs]) { // if f(E)<f(B)
-  	    for (j=0;j<=n-1;j++) { // replace W with E
-  	      v[vg][j] = ve[j];
-  	    }
-  	    f[vg] = fe;
-      }
-      else {
-  	    for (j=0;j<=n-1;j++) { // replace W with E
-  	      v[vg][j] = vr[j];
-  	    }
-  	    f[vg] = fr;
-      }
-      updateObjectives();
-      state_=start;
-      break;
-
-    /** CONTRACTION **/
-
-    /** This state is entered when we have received a sample of the objective
-     ** function at the contraction vertex
-     **/
+        res = do_step_expand(param);
+    break;
     case contraction:
-#ifdef NMD_DEBUG_
-      std::cout << "[NelderMead|DEBUG] State = Contraction" << std::endl;
-#endif
-      fc=param;
-      //fc = objfunc(vc);
-      if (fc < f[vg]) { // f(C)<f(W)
-  	    for (j=0;j<=n-1;j++) {
-	        v[vg][j] = vc[j];
-	      }
-	      f[vg] = fc;
-      } else {
-        // apply shrinking
-	      for (row=0;row<=n;row++) {
-	        if (row != vs) {
-	          for (j=0;j<=n-1;j++) {
-	            v[row][j] = v[vs][j]+(v[row][j]-v[vs][j])/2.0;
-              my_constraints(v[row]);
-	         }
-	        }
-	      }
-      }
-      updateObjectives();
-      state_=start;
-      break;
+        res = do_step_contract(param);
+    break;
+    case shrink:
+        res = do_step_shrink(param);
+    break;
+    default:
+        std::cout << "Unknown NelderMead state (" << state_ << ")" << std::endl;
+        res.converged = false;
+        return res;
     }
-  }
 
-  /* print out the value at each iteration */
-#ifdef NMD_DEBUG_
-  print_iteration();
-#endif
-  res.converged=testConvergence();
-  return res;
+    res.converged = testConvergence();
+
+    if ( res.converged == true ) {
+        res.threads = v[vs][0];
+        res.freq_idx = v[vs][1];
+        std::cout << "Converged to " << res.threads << " " << res.freq_idx << std::endl;
+    }
+
+    return res;
 }
 
-bool NelderMead::testConvergence(){
-
-  fsum = 0.0;
-  for (j=0;j<=n;j++) {
-    fsum += f[j];
-  }
-  favg = fsum/(n+1);
-  s = 0.0;
-  for (j=0;j<=n;j++) {
-    s += pow((f[j]-favg),2.0)/(n);
-  }
-  s = sqrt(s);
-  s = s /favg; // normalization step
+bool NelderMead::testConvergence()
+{
+    double temp;
+
+    fsum = 0.0;
+    for (j = 0; j <= n; j++)
+    {
+        fsum += f[j];
+    }
+    favg = fsum / (n + 1);
+    s = 0.0;
+    for (j = 0; j <= n; j++)
+    {
+        temp = (f[j] - favg);
+        s += temp * temp / (n);
+    }
+    s = sqrt(s);
+    s = s / favg; // normalization step
 #ifdef NMD_INFO_
-  std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl;
-  std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl;
+    std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl;
+    std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl;
 #endif
-  if (s >= EPSILON && itr <= MAXITERATIONS)
-    return false;
-  else{
-    vs = vs_index();
-    min=f[vs];
-    return true;
-  }
+    if (s >= EPSILON && itr <= MAXITERATIONS)
+        return false;
+    else
+    {
+        vs = vs_index();
+        min = f[vs];
+        return true;
+    }
 }
 
-void NelderMead::updateObjectives(){
-  /* re-evaluate all the vertices */
-	/*for (j=0;j<=n;j++) {
-	  f[j] = objfunc(v[j]);
-	}
-  */
+void NelderMead::updateObjectives()
+{
+    /* re-evaluate all the vertices */
+    /*for (j=0;j<=n;j++) {
+                  f[j] = objfunc(v[j]);
+                  }
+                  */
 
-	/* find the index of the largest value */
-	vg = vg_index();
+    /* find the index of the largest value */
+    vg = vg_index();
 
-	/* find the index of the smallest value */
-	vs = vs_index();
+    /* find the index of the smallest value */
+    vs = vs_index();
 
-	/* find the index of the second largest value */
-	vh = vh_index();
+    /* find the index of the second largest value */
+    vh = vh_index();
 
-  my_constraints(v[vg]);
+    my_constraints(v[vg]);
 
-	//f[vg] = objfunc(v[vg]);
+    //f[vg] = objfunc(v[vg]);
 
-	my_constraints(v[vh]);
+    my_constraints(v[vh]);
 
-  //f[vh] = objfunc(v[vh]);
+    //f[vh] = objfunc(v[vh]);
 }
 
-}
-}
+} // namespace components
+} // namespace allscale
 /*
 
-std::vector<double> NelderMead::minimum(){
-
-
-  free(f);
-  free(vr);
-  free(ve);
-  free(vc);
-  free(vm);
-  for (i=0;i<=n;i++) {
-    free (v[i]);
-  }
-  free(v);
-  return min;
-
-
-}
-*/
+       std::vector<double> NelderMead::minimum(){
 
 
+       free(f);
+       free(vr);
+       free(ve);
+       free(vc);
+       free(vm);
+       for (i=0;i<=n;i++) {
+       free (v[i]);
+       }
+       free(v);
+       return min;
 
 
+       }
+       */
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 678d539..e2b7df9 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -490,8 +490,27 @@ void scheduler::init() {
     lopt_.reset(os_thread_count,0);
   #if defined(ALLSCALE_HAVE_CPUFREQ)
     using hardware_reconf = allscale::components::util::hardware_reconf;
+    auto  freqs = hardware_reconf::get_frequencies(0);
+
+    const std::size_t max_freqs = 5;
+    std::size_t keep_every = (std::size_t) ceilf(freqs.size() / (float) max_freqs);
+
+    if ( keep_every > 1 ) {
+      std::vector<unsigned long> new_freqs;
+
+      int i, j, len;
+
+      for (j=0, i=0, len=freqs.size(); i<len; ++i ) {
+        if ( (i==len-1) || ( (i % keep_every) == 0 )) {
+          new_freqs.push_back(freqs[i]);
+        }
+      }      
+
+        freqs = new_freqs;
+    }
+
     std::vector<unsigned long> freq_temp =
-      lopt_.setfrequencies(hardware_reconf::get_frequencies(0));
+      lopt_.setfrequencies(freqs);
     if (freq_temp.empty()){
       HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
       "error in initializing the local optimizer, allowed frequency values are empty");
diff --git a/src/components/util/hardware_reconf.cpp b/src/components/util/hardware_reconf.cpp
index 4cf1491..b515977 100644
--- a/src/components/util/hardware_reconf.cpp
+++ b/src/components/util/hardware_reconf.cpp
@@ -5,6 +5,7 @@
 #include <memory>
 #include <mutex>
 #include <cpufreq.h>
+#include <algorithm>    // std::sort
 
 #include <boost/format.hpp>
 
@@ -25,6 +26,7 @@ namespace allscale { namespace components { namespace util {
         if (available_frequencies != nullptr)
             cpufreq_put_available_frequencies(available_frequencies);
 
+        std::sort(frequencies.begin(), frequencies.end());
         return frequencies;
     }
 

From 9fafe64bd58e380396e27b540ea4c6fdf0b4bdda Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 13 Nov 2018 16:47:47 +0000
Subject: [PATCH 05/37] Explore all objectives at the same time

easier integration with dashboard
---
 allscale/components/localoptimizer.hpp   |  41 +-
 allscale/components/nmsimplex_bbincr.hpp | 243 ++++----
 allscale/components/scheduler.hpp        |   6 +-
 src/components/localoptimizer.cpp        | 664 ++++++++--------------
 src/components/nmsimplex_bbincr.cpp      | 690 +++++++++++++++++------
 src/components/scheduler_component.cpp   |  20 +-
 6 files changed, 959 insertions(+), 705 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index 1f7aae0..d9799cc 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -3,6 +3,7 @@
 #define ALLSCALE_COMPONENTS_LOCALOPTIMIZER_HPP
 
 #include <allscale/components/nmsimplex_bbincr.hpp>
+
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 #include <allscale/util/hardware_reconf.hpp>
 #endif
@@ -20,7 +21,7 @@
 
 namespace allscale { namespace components {
 
-    enum objectiveType {time,energy,resource};
+    enum objectiveType {time, energy, resource};
 
     enum parameterType {thread, frequency};
 
@@ -28,6 +29,8 @@ namespace allscale { namespace components {
 
     /* structure type of a single optimization objective */
     struct objective{
+      double last_scores[3];
+
       objectiveType type;
       /* leeway threshold desired, 0-1 double */
       double leeway;
@@ -60,7 +63,7 @@ namespace allscale { namespace components {
       double minimization_params[2];
     };
 
-
+    
     /* structure type modelling an optimization actuation action to be taken
        by the scheduler */
     struct actuation{
@@ -81,14 +84,19 @@ namespace allscale { namespace components {
     {
         localoptimizer()
             :nmd(0.01),
+            pending_threads(0.),
+            pending_energy(0.),
+            pending_time(0.),
+            pending_num_times(0.),
+            mo_initialized(false),
 #if defined(ALLSCALE_HAVE_CPUFREQ)
             frequency_param_(0),
 #endif
             current_objective_idx_(0),converged_(false)
     {
-            if (optmethod_==random)
-                srand (std::time(NULL));
-            }
+        if (optmethod_==random)
+            srand (std::time(NULL));
+        }
 
         localoptimizer(std::list<objective>);
 
@@ -99,10 +107,14 @@ namespace allscale { namespace components {
 #ifdef DEBUG_
           std::cout << "Local Optimizer Initialized with "
                     << policyToString(pol)
-                    << " policy for single objective search."
+                    << " policy for multi-objective search."
                     << std::endl;
 #endif
         }
+#ifdef ALLSCALE_HAVE_CPUFREQ
+        void initialize_nmd();
+#endif
+        double opt_weights[NMD_NUM_OBJECTIVES];
 
         searchPolicy getPolicy(){return optmethod_;}
 
@@ -166,6 +178,23 @@ namespace allscale { namespace components {
         }
 
     private:
+        void accumulate_objective_measurements();
+        void reset_accumulated_measurements();
+
+        std::vector<double> samples_energy;
+        std::vector<double> samples_time;
+        std::vector<double> samples_threads;
+        std::vector<double> samples_freq;
+
+        bool explore_knob_domain;
+        
+        double initialization_samples[NMD_NUM_KNOBS+1][NMD_NUM_OBJECTIVES];
+        double initialization_params[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
+
+        double pending_time, pending_energy, pending_threads;
+        unsigned long pending_num_times;
+
+        bool mo_initialized;
 
         /* vector of active optimization objectives. Objectives are stored
            in the vector in decreasing priority order */
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index ea4f3bd..e87fe8c 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -18,146 +18,195 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include <chrono>
+#include <utility>
+#include <map>
+
 #ifdef MACOSX
 #include <malloc/malloc.h>
 #else
 #include <malloc.h>
 #endif
 
-namespace allscale { namespace components {
+namespace allscale
+{
+namespace components
+{
+
+// VV: threads, freq_idx
+#define NMD_NUM_KNOBS 2
+// VV: time, energy/power, resources
+#define NMD_NUM_OBJECTIVES 3
+
+#define MAX_IT 1000 /* maximum number of iterations */
+#define ALPHA 1.0   /* reflection coefficient */
+#define BETA 0.5    /* contraction coefficient */
+#define GAMMA 2.0   /* expansion coefficient */
+#define DELTA 0.5   /* shrinking coefficient */
 
-#define MAX_IT      1000      /* maximum number of iterations */
-#define ALPHA       1.0       /* reflection coefficient */
-#define BETA        0.5       /* contraction coefficient */
-#define GAMMA       2.0       /* expansion coefficient */
-#define DELTA       0.5        /* shrinking coefficient */
+#define CACHE_EXPIRE_AFTER_MS 5000
 
 /* structure type of a single optimization step return status */
-struct optstepresult{
-      /* true if optimization has converged for the specified objective */
-      bool converged;
-      /* number of threads for parameters to set for sampling */
-      double threads;
-      /* index to frequency vector for freq parameter to set for sampling*/
-      int freq_idx;
+struct optstepresult
+{
+  /* true if optimization has converged for the specified objective */
+  bool converged;
+  /* number of threads for parameters to set for sampling */
+  double threads;
+  /* index to frequency vector for freq parameter to set for sampling*/
+  int freq_idx;
+
+  /******VV: Cache stuff******/
+  double score;
+  double objectives[3]; // (time, energy, resource)
+  // VV: _cache_expires_dt denotes dt (in ms) after _cache_timestamp
+  int64_t _cache_timestamp, _cache_expires_dt;
 };
 
+typedef std::map<std::pair<int, int>, optstepresult> MapCache_t;
+
 /* enumeration encoding state that the incremental Nelder Mead optimizer is at */
-enum iterationstates {start, reflection, expansion,
-                      contraction, shrink};
+enum iterationstates
+{
+  start,
+  reflection,
+  expansion,
+  contraction,
+  shrink
+};
 
-class NelderMead {
+class NelderMead
+{
+
+public:
+  NelderMead(double);
+  // VV: For the time being params = [threads, freq_idx]
+  //     objectives = [time, energy/power, resources]
+  //     weights = [ W_time, W_energy/power, W_resources ]
+  //     constraint_min = [min_threads, min_freq_idx]
+  void initialize_simplex(double params[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS],
+                          double objectives[][NMD_NUM_OBJECTIVES],
+                          double weights[NMD_NUM_OBJECTIVES],
+                          double constraint_min[NMD_NUM_KNOBS],
+                          double constraint_max[NMD_NUM_KNOBS]);
+  void print_initial_simplex();
+  void print_iteration();
+  
+  double *getMinVertices()
+  {
+    return v[vs];
+  }
+
+  double getMinObjective()
+  {
+    return min;
+  }
+
+  unsigned long int getIterations() { return itr; }
+  double evaluate_score(const double objectives[], const double *weights) const;
+  void set_weights(double weights[]);
+
+  optstepresult step(const double objectives[]);
+private:
+  //VV: cache layout: { <threads, freq_idx>: optstepresult }
+  MapCache_t cache_;
+  
+  optstepresult do_step_start();
+  optstepresult do_step_reflect(const double objectives[]);
+  optstepresult do_step_expand(const double objectives[]);
+  optstepresult do_step_contract(const double objectives[]);
+  optstepresult do_step_shrink(const double objectives[]);
 
-  public:
-    NelderMead(double);
-    void initialize_simplex(double params[][2], double*,double*,double*);
-    void print_initial_simplex();
-    void print_iteration();
-    optstepresult step(double param);
-    double* getMinVertices(){
-        return v[vs];
-    }
+  bool knob_set_exists(double knobs[2], int exclude);
 
-    double getMinObjective(){
-        return min;
-    }
+  void sort_vertices(void);
+  void my_constraints(double *);
+  void centroid();
+  bool testConvergence();
 
-    unsigned long int getIterations(){return itr;}
+  // VV: Will return false if entry not in cache
+  bool cache_update(int threads, int freq_idx, 
+                    const double objectives[],
+                    bool add_if_new);
 
-  private:
+  double round2(double num, int precision)
+  {
+    double rnum = 0.0;
+    int tnum;
 
-    optstepresult do_step_start(double param);
-    optstepresult do_step_reflect(double param);
-    optstepresult do_step_expand(double param);
-    optstepresult do_step_contract(double param);
-    optstepresult do_step_shrink(double param);
+    if (num == 0.0)
+      return num;
 
-    int vg_index();
-    int vs_index();
-    int vh_index();
-    void sort_vertices(void);
-    void my_constraints(double*);
-    void centroid();
-    bool testConvergence();
-    void updateObjectives();
+    rnum = num * pow(10, precision);
+    tnum = (int)(rnum < 0 ? rnum - 0.5 : rnum + 0.5);
+    rnum = tnum / pow(10, precision);
 
-    double round2(double num, int precision)
-    {
-      double rnum = 0.0;
-      int tnum;
+    return rnum;
+  }
 
-      if (num == 0.0)
-        return num;
+  /* vertex with smallest value */
+  int vs;
 
-      rnum = num*pow(10,precision);
-      tnum = (int)(rnum < 0 ? rnum-0.5 : rnum + 0.5);
-      rnum = tnum/pow(10,precision);
+  /* vertex with next smallest value */
+  int vh;
 
-      return rnum;
-    }
+  /* vertex with largest value */
+  int vg;
 
-    /* vertex with smallest value */
-    int vs;         
+  int i, j, row;
+  
+  const int n = 2;
 
-     /* vertex with next smallest value */
-    int vh;        
+  /* track the number of function evaluations */
+  int k;
 
-    /* vertex with largest value */
-    int vg;         
-	
-    int i,j,row;
+  /* track the number of iterations */
+  int itr;
 
-    const int n=2;
+  /* holds vertices of simplex */
+  double **v;
 
-    /* track the number of function evaluations */
-    int k;
+  /* value of function at each vertex */
+  double *f;
 
-    /* track the number of iterations */
-    int itr;	  
-	
-    /* holds vertices of simplex */
-    double **v;
+  /* value of function at reflection point */
+  double fr;
 
-     /* value of function at each vertex */
-    double *f;
+  /* value of function at expansion point */
+  double fe;
 
-    /* value of function at reflection point */
-    double fr;      
+  /* value of function at contraction point */
+  double fc;
 
-    /* value of function at expansion point */
-    double fe;      
+  /* reflection - coordinates */
+  double *vr;
 
-     /* value of function at contraction point */
-    double fc;      
+  /* expansion - coordinates */
+  double *ve;
 
-    /* reflection - coordinates */
-    double *vr;    
+  /* contraction - coordinates */
+  double *vc;
 
-    /* expansion - coordinates */
-    double *ve;     
+  /* centroid - coordinates */
+  double *vm;
 
-    /* contraction - coordinates */
-    double *vc;     
+  double min;
 
-    /* centroid - coordinates */
-    double *vm;     
+  double fsum, favg, s;
 
-    double min;
-	
-    double fsum,favg,s;
+  double EPSILON;
 
-    double EPSILON;
+  iterationstates state_;
 
-    iterationstates state_;
+  const int MAXITERATIONS = 15;
 
-    const int MAXITERATIONS = 15;
-  
-    double constraint_min[2];
+  double constraint_min[2];
 
-    double constraint_max[2];
+  double constraint_max[2];
 
+  double opt_weights[NMD_NUM_OBJECTIVES];
 };
 
-}
-}
+} // namespace components
+} // namespace allscale
 #endif
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 7eed6e5..c508900 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -5,8 +5,10 @@
 #include <allscale/work_item.hpp>
 #include <allscale/components/treeture_buffer.hpp>
 #include <allscale/components/localoptimizer.hpp>
+
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 #include <allscale/util/hardware_reconf.hpp>
+#else
 #endif
 
 #include <hpx/include/components.hpp>
@@ -108,7 +110,7 @@ namespace allscale { namespace components {
         long last_optimization_timestamp_;
 
         /* periodicity in milliseconds to invoke the optimizer */
-        const long optimization_period_ms = 5;
+        const long optimization_period_ms = 5000;
 
         /* captures absolute timestamp of the last time optimization
            objective value have been measured (sampled) */
@@ -117,7 +119,7 @@ namespace allscale { namespace components {
         long last_objective_measurement_timestamp_;
 
         /* periodicity in milliseconds to invoke objective sampling */
-        const long objective_measurement_period_ms = 1;
+        const long objective_measurement_period_ms = 1000;
 
         //extra masks to better handle suspending/resuming threads
         std::vector<hpx::threads::thread_pool_base*> thread_pools_;
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 593853b..04ef472 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -24,487 +24,315 @@ namespace allscale
 {
 namespace components
 {
-
 localoptimizer::localoptimizer(std::list<objective> targetobjectives)
-    : objectives_((int)targetobjectives.size()),
-      nmd(convergence_threshold_),
-      param_changes_(0),
-      steps_(0),
-      current_param_(thread),
-      converged_(false)
+	: objectives_((int)targetobjectives.size()),
+	  nmd(convergence_threshold_),
+	  param_changes_(0),
+	  steps_(0),
+	  current_param_(thread),
+	  converged_(false)
 {
-  for (objective o : targetobjectives)
-  {
-    //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-    objectives_[o.priority] = o;
-    objectives_[o.priority].localmin = 10000;
-    objectives_[o.priority].globalmin = 10000;
-    objectives_[o.priority].localmax = 0.0;
-    objectives_[o.priority].globalmax = 0.0;
-    objectives_[o.priority].converged = false;
-    objectives_[o.priority].initialized = false;
-    objectives_[o.priority].min_params_idx = 0;
-    objectives_[o.priority].converged_minimum = 0;
-  }
+	for (objective o : targetobjectives)
+	{
+		//std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
+		objectives_[o.priority] = o;
+		objectives_[o.priority].localmin = 10000;
+		objectives_[o.priority].globalmin = 10000;
+		objectives_[o.priority].localmax = 0.0;
+		objectives_[o.priority].globalmax = 0.0;
+		objectives_[o.priority].converged = false;
+		objectives_[o.priority].initialized = false;
+		objectives_[o.priority].min_params_idx = 0;
+		objectives_[o.priority].converged_minimum = 0;
+	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
-  setCurrentFrequencyIdx(0);
+	setCurrentFrequencyIdx(0);
 #endif
 };
 
 void localoptimizer::setobjectives(std::list<objective> targetobjectives)
 {
-  objectives_.clear();
-  objectives_.resize((int)targetobjectives.size());
-  for (objective o : targetobjectives)
-  {
-    //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-    objectives_[o.priority] = o;
-    objectives_[o.priority].localmin = 10000;
-    objectives_[o.priority].globalmin = 10000;
-    objectives_[o.priority].localmax = 0.0;
-    objectives_[o.priority].globalmax = 0.0;
-    objectives_[o.priority].converged = false;
-    objectives_[o.priority].initialized = false;
-    objectives_[o.priority].min_params_idx = 0;
-    objectives_[o.priority].converged_minimum = 0;
-  }
-  steps_ = 0;
-  param_changes_ = 0;
-  current_param_ = thread;
+	objectives_.clear();
+	objectives_.resize((int)targetobjectives.size());
+
+	explore_knob_domain = true;
+
+	for (objective o : targetobjectives)
+	{
+		//std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
+		objectives_[o.priority] = o;
+		objectives_[o.priority].localmin = 10000;
+		objectives_[o.priority].globalmin = 10000;
+		objectives_[o.priority].localmax = 0.0;
+		objectives_[o.priority].globalmax = 0.0;
+		objectives_[o.priority].converged = false;
+		objectives_[o.priority].initialized = false;
+		objectives_[o.priority].min_params_idx = 0;
+		objectives_[o.priority].converged_minimum = 0;
+
+		opt_weights[o.type] = o.leeway;
+	}
+	steps_ = 0;
+	param_changes_ = 0;
+	current_param_ = thread;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-  setCurrentFrequencyIdx(0);
+	setCurrentFrequencyIdx(0);
 #endif
-  converged_ = false;
+	converged_ = false;
 }
 
 void localoptimizer::reset(int threads, int freq_idx)
 {
-  threads_param_ = threads;
-  param_changes_ = 0;
-  thread_param_values_.clear();
+	threads_param_ = threads;
+	param_changes_ = 0;
+	thread_param_values_.clear();
 #ifdef ALLSCALE_HAVE_CPUFREQ
-  frequency_param_ = freq_idx;
-  frequency_param_values_.clear();
+	frequency_param_ = freq_idx;
+	frequency_param_values_.clear();
 #endif
-  current_objective_idx_ = 0;
-  steps_ = 0;
-  current_param_ = thread;
-  converged_ = false;
+	current_objective_idx_ = 0;
+	steps_ = 0;
+	current_param_ = thread;
+	converged_ = false;
 };
 
 #ifdef DEBUG_
 void localoptimizer::printobjectives()
 {
-  for (auto &el : objectives_)
-  {
-    std::cout << "Objective"
-              << "\t\t"
-              << "Priority"
-              << "\t\t"
-              << "Leeway" << std::endl;
-    switch (el.type)
-    {
-    case time:
-      std::cout << "Time"
-                << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
-      break;
-    case energy:
-      std::cout << "Energy"
-                << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
-      break;
-    case resource:
-      std::cout << "Resource"
-                << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
-      break;
-    }
-  }
+	for (auto &el : objectives_)
+	{
+		std::cout << "Objective"
+				  << "\t\t"
+				  << "Priority"
+				  << "\t\t"
+				  << "Leeway" << std::endl;
+		switch (el.type)
+		{
+		case time:
+			std::cout << "Time"
+					  << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
+			break;
+		case energy:
+			std::cout << "Energy"
+					  << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
+			break;
+		case resource:
+			std::cout << "Resource"
+					  << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
+			break;
+		}
+	}
 }
 
 void localoptimizer::printverbosesteps(actuation act)
 {
-  static int last_frequency_idx = 0;
-
-  std::cout << "[INFO]";
-  if (optmethod_ == random)
-    std::cout << "Random ";
-  else if (optmethod_ == allscale)
-  {
-    std::cout << "Allscale ";
-  }
-  std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_;
+	static int last_frequency_idx = 0;
+
+	std::cout << "[INFO]";
+	if (optmethod_ == random)
+		std::cout << "Random ";
+	else if (optmethod_ == allscale)
+	{
+		std::cout << "Allscale ";
+	}
+	std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-  if (act.frequency_idx >= 0)
-    last_frequency_idx = act.frequency_idx;
-  std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx]
-            << std::endl;
+	if (act.frequency_idx >= 0)
+		last_frequency_idx = act.frequency_idx;
+	std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx]
+			  << std::endl;
 #else
-  std::cout << std::endl;
+	std::cout << std::endl;
 #endif
 }
 
 #endif
 
-void localoptimizer::measureObjective(double iter_time, double power, double threads)
+void localoptimizer::accumulate_objective_measurements()
 {
-  std::cout << "Measuring objective: "
-            << iter_time << " "
-            << power << " "
-            << threads << std::endl;
-
-  for (auto &el : objectives_)
-  {
-    switch (el.type)
-    {
-    case time:
-      el.samples.insert(el.samples.begin(), iter_time);
-      if (el.samples.size() > 1000)
-        el.samples.resize(500);
-
-      el.threads_samples.insert(el.threads_samples.begin(), threads);
-      if (el.threads_samples.size() > 1000)
-        el.threads_samples.resize(500);
+	if (pending_num_times)
+	{
+		pending_time /= (double)pending_num_times;
+		pending_threads /= (double)pending_num_times;
+		pending_energy /= (double)pending_num_times;
+		pending_num_times = 0;
+	}
+}
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-      el.freq_samples.insert(el.freq_samples.begin(), getCurrentFrequencyIdx());
-      if (el.freq_samples.size() > 1000)
-        el.freq_samples.resize(500);
-#endif
+void localoptimizer::initialize_nmd()
+{
+	// VV: Retrieve measurements for last exploration
+	if ( steps_ == warmup_steps_ +1 )
+	{
+		accumulate_objective_measurements();
 
-      if (el.globalmin > iter_time)
-      {
-        el.globalmin = iter_time;
-        el.min_params_idx = param_changes_;
-      }
-      if (el.globalmax < iter_time)
-        el.globalmax = iter_time;
-#ifdef DEBUG__
-      std::cout << "Iteration Time Minimum: " << el.globalmin << std::endl;
-      std::cout << "Iteration Time Maximum: " << el.globalmax << std::endl;
-      std::cout << "Iteration Time Samples: ";
-      for (auto &samp : el.samples)
-        std::cout << samp << ",";
-      std::cout << std::endl;
-#endif
-      break;
-    case energy:
-      el.samples.insert(el.samples.begin(), power);
-      if (el.samples.size() > 1000)
-        el.samples.resize(500);
+		initialization_samples[steps_ - 2][0] = pending_time;
+		initialization_samples[steps_ - 2][1] = pending_energy;
+		initialization_samples[steps_ - 2][2] = pending_threads;
 
-      el.threads_samples.insert(el.threads_samples.begin(), threads);
-      if (el.threads_samples.size() > 1000)
-        el.threads_samples.resize(500);
+		reset_accumulated_measurements();
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-      el.freq_samples.insert(el.freq_samples.begin(), getCurrentFrequencyIdx());
-      if (el.freq_samples.size() > 1000)
-        el.freq_samples.resize(500);
-#endif
+		initialization_params[steps_ - 2][1] = getCurrentFrequencyIdx();
+	}
+	
+	// VV: Place reasonable limits on the #threads and cpu_freq tunable knobs
+	double min_threads = round(max_threads_ * 0.25);
 
-      if (el.globalmin > power)
-      {
-        el.globalmin = power;
-        el.min_params_idx = param_changes_;
-      }
-      if (el.globalmax < power)
-        el.globalmax = power;
-#ifdef DEBUG__
-      std::cout << "Power Consumption Minimum: " << el.globalmin << std::endl;
-      std::cout << "Power Consumption Maximum: " << el.globalmax << std::endl;
-      std::cout << "Power Consumption Samples: ";
-      for (auto &samp : el.samples)
-        std::cout << samp << ",";
-      std::cout << std::endl;
-#endif
-      break;
-    case resource:
-      el.samples.insert(el.samples.begin(), threads);
-      if (el.samples.size() > 1000)
-        el.samples.resize(500);
+	if (min_threads < 1.0)
+		min_threads = 1.0;
 
-      el.threads_samples.insert(el.threads_samples.begin(), threads);
-      if (el.threads_samples.size() > 1000)
-        el.threads_samples.resize(500);
+	double constraint_min[] = {min_threads, 0};
+	double constraint_max[] = {(double)max_threads_,
+							   (double)frequencies_param_allowed_.size() - 1};
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-      el.freq_samples.insert(el.freq_samples.begin(), getCurrentFrequencyIdx());
-      if (el.freq_samples.size() > 1000)
-        el.freq_samples.resize(500);
-#endif
+	nmd.initialize_simplex(initialization_params, 
+						   initialization_samples,
+						   opt_weights,
+						   constraint_min, constraint_max);
 
-      if (el.globalmin > threads)
-      {
-        el.globalmin = threads;
-        el.min_params_idx = param_changes_;
-      }
-      if (el.globalmax < threads)
-        el.globalmax = threads;
-#ifdef DEBUG__
-      std::cout << "Threads Minimum: " << el.globalmin << std::endl;
-      std::cout << "Threads Maximum: " << el.globalmax << std::endl;
-      std::cout << "Threads Samples: ";
-      for (auto &samp : el.samples)
-        std::cout << samp << ",";
-      std::cout << std::endl;
-#endif
-      break;
-    }
-  }
+	mo_initialized = true;
+	explore_knob_domain = true;
 }
-
-actuation localoptimizer::step()
-{
-  steps_++;
-  actuation act;
-  act.delta_threads = threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  act.frequency_idx = frequency_param_;
-#endif
-  /* random optimization step */
-  if (optmethod_ == random)
-  {
-    act.delta_threads = (rand() % max_threads_);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    act.frequency_idx = rand() % frequencies_param_allowed_.size();
-    // if (act.frequency_idx == frequency_param_)
-    //     act.frequency_idx = -1;
 #endif
-  }
-
-  else if (optmethod_ == allscale)
-  {
-    if (current_objective_idx_ > objectives_.size())
-      goto validate_act;
 
-    if (steps_ < warmup_steps_)
-    {
+void localoptimizer::measureObjective(double iter_time, double power, double threads)
+{
+	std::cout << "Measuring objective: "
+			  << iter_time << " "
+			  << power << " "
+			  << threads << std::endl;
+
+	if (steps_)
+	{
+		pending_time += iter_time;
+		pending_energy += power;
+		pending_threads += threads;
+		pending_num_times++;
+	}
+}
 
-#ifdef DEBUG_MULTIOBJECTIVE_
-      std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n";
-#endif
-      // set some random parametrization to collect at least 3 different
-      // vertices to be used as input to the optimizer
+void localoptimizer::reset_accumulated_measurements()
+{
+	pending_time = 0.;
+	pending_energy = 0.;
+	pending_threads = 0.;
+	pending_num_times = 0;
+}
 
-#if 1
-      float bucket_dt = steps_ / (float)warmup_steps_;
-      float _min_threads = max_threads_ * bucket_dt;
+actuation localoptimizer::step()
+{
 
-      act.delta_threads = rand() % (int)ceil(bucket_dt) + roundf(_min_threads);
+	steps_++;
+	actuation act;
+	act.delta_threads = threads_param_;
 #ifdef ALLSCALE_HAVE_CPUFREQ
-      float _min_freqs = frequencies_param_allowed_.size() * bucket_dt;
-      act.frequency_idx = rand() % (int)ceil(bucket_dt) + roundf(_min_freqs);
-#endif
+	act.frequency_idx = frequency_param_;
 #endif
-      goto validate_act;
-    }
-
-    // iterate over all objectives in decreasing priority
-    objective obj = objectives_[current_objective_idx_];
-
-    // initialize optimizer for this objective, if not already done so
-    if (!obj.initialized)
-    {
-#ifdef DEBUG_MULTIOBJECTIVE_
-      std::cout << "[LOCALOPTIMIZER|INFO] Initializing optimizer for new objective\n";
-      std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
-      for (auto &sam : obj.samples)
-      {
-        std::cout << sam << "," << std::flush;
-      }
-      std::cout << "\n"
-                << std::flush;
-
-      std::cout << "[LOCALOPTIMIZER|DEBUG] Thread Param of Samples: " << std::flush;
-      for (auto &sam : obj.threads_samples)
-      {
-        std::cout << sam << "," << std::flush;
-      }
-      std::cout << "\n"
-                << std::flush;
-
+	/* random optimization step */
+	if (optmethod_ == random)
+	{
+		act.delta_threads = (rand() % max_threads_);
 #ifdef ALLSCALE_HAVE_CPUFREQ
-      std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
-      for (auto &sam : obj.freq_samples)
-      {
-        std::cout << sam << "," << std::flush;
-      }
-      std::cout << "\n"
-                << std::flush;
+		act.frequency_idx = rand() % frequencies_param_allowed_.size();
+		// if (act.frequency_idx == frequency_param_)
+		//     act.frequency_idx = -1;
 #endif
-#endif
-      int samplenr = obj.samples.size();
+	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
-      double params[3][2] = {
-          {obj.threads_samples[samplenr - 1], obj.freq_samples[samplenr - 1]},
-          {obj.threads_samples[samplenr - 2], obj.freq_samples[samplenr - 2]},
-          {obj.threads_samples[samplenr - 3], obj.freq_samples[samplenr - 3]},
-      };
-      double values[3] = {obj.samples[samplenr - 1], obj.samples[samplenr - 2], obj.samples[samplenr - 3]};
-      double min_threads = round(max_threads_ * 0.25);
-
-      if (min_threads < 1.0)
-        min_threads = 1.0;
-
-      double constraint_min[] = {min_threads, 0};
-      double constraint_max[] = {(double)max_threads_,
-                                 (double)frequencies_param_allowed_.size() - 1};
-      std::cout << "initialize_simplex::Initializing with " << frequencies_param_allowed_.size() << " frequencies" << std::endl;
-      nmd.initialize_simplex(params, values, constraint_min, constraint_max);
-      objectives_[current_objective_idx_].initialized = true;
-#endif
-    }
-
+	else if (optmethod_ == allscale)
+	{
+		if (steps_ <= warmup_steps_)
+		{
 #ifdef DEBUG_MULTIOBJECTIVE_
-    std::cout << "[LOCALOPTIMIZER|DEBG] Current Optimized Objective =";
-    switch (obj.type)
-    {
-    case energy:
-      std::cout << "********** Energy\n"
-                << std::flush;
-      break;
-    case time:
-      std::cout << "&&&&&&&&&& Time\n"
-                << std::flush;
-      break;
-    case resource:
-      std::cout << "oooooooooo Resource\n"
-                << std::flush;
-      break;
-    }
-    std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
-    for (auto &sam : obj.samples)
-    {
-      std::cout << sam << "," << std::flush;
-    }
-    std::cout << "\n"
-              << std::flush;
-
-    std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    for (auto &sam : obj.freq_samples)
-    {
-      std::cout << sam << "," << std::flush;
-    }
-    std::cout << "\n"
-              << std::flush;
-#endif
-#endif
-
-    optstepresult nmd_res = nmd.step(obj.samples[0]);
+			std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n";
+#endif
+			// set some random parametrization to collect at least 3 different
+			// vertices to be used as input to the optimizer
+
+			// VV: TODO Ensure that the 3 configurations we pick are all different
+			float bucket_dt = steps_ / (float)warmup_steps_;
+			float _min_threads = max_threads_ * bucket_dt;
+
+			act.delta_threads = rand() % (int)ceil(bucket_dt) + roundf(_min_threads);
+
+			float _min_freqs = frequencies_param_allowed_.size() * bucket_dt;
+			act.frequency_idx = rand() % (int)ceil(bucket_dt) + roundf(_min_freqs);
+
+			if (steps_ > 1)
+			{
+				accumulate_objective_measurements();
+				initialization_samples[steps_ - 2][0] = pending_time;
+				initialization_samples[steps_ - 2][1] = pending_energy;
+				initialization_samples[steps_ - 2][2] = pending_threads;
+				reset_accumulated_measurements();
+				initialization_params[steps_ - 2][0] = getCurrentThreads();
+
+			initialization_params[steps_ - 2][1] = getCurrentFrequencyIdx();
+
+			}
+			goto validate_act;
+		}
+
+		if (mo_initialized == false)
+			initialize_nmd();
+				
+		accumulate_objective_measurements();
+		const double latest_measurements[] = {pending_time, 
+											pending_energy, 
+											pending_threads};
+		reset_accumulated_measurements();
+
+		if ( explore_knob_domain ){
+			optstepresult nmd_res = nmd.step(latest_measurements);
 #ifdef DEBUG_MULTIOBJECTIVE_
-    std::cout << "[LOCALOPTIMIZER|DEBUG] Calling NMD Optimizer Step, Param = \n";
-    std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try: ";
-    std::cout << "Threads = " << nmd_res.threads;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl;
-#endif
-    std::cout << "Converg Thresh = " << convergence_threshold_ << std::endl;
-#endif
-    if (nmd_res.converged)
-    {
-      objectives_[current_objective_idx_].converged = true;
-      objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
-      double *minimization_point = nmd.getMinVertices();
-      objectives_[current_objective_idx_].minimization_params[0] =
-          minimization_point[0];
-      objectives_[current_objective_idx_].minimization_params[1] =
-          minimization_point[1];
-#ifdef DEBUG_CONVERGENCE_
-      std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n";
-      std::cout << "******************************************" << std::endl;
-      std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << objectives_[current_objective_idx_].converged_minimum << "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] << std::endl;
-      std::cout << "******************************************" << std::endl;
-#endif
-      act.delta_threads = minimization_point[0];
-#ifdef ALLSCALE_HAVE_CPUFREQ
-      act.frequency_idx = minimization_point[1];
+			std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try:";
+			std::cout << " Threads = " << nmd_res.threads;
+			std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl;
+			std::cout << " Converge Thresh = " << convergence_threshold_ << std::endl;
 #endif
-      current_objective_idx_++;
-      if (current_objective_idx_ == objectives_.size())
-      {
-        converged_ = true;
-#ifdef DEBUG_CONVERGENCE_
-        std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
-#endif
-      }
-    }
-    else
-    {
-#if 0
-      // if a higher priority objective starts getting off leeway margin,
-      // decide convergence of the current param at this parameter point
-      if (current_objective_idx_ > 0)
-        for (int i = 0; i < current_objective_idx_; i++)
-        {
-          objective priority_obj = objectives_[i];
-          double max_leeway_value = priority_obj.converged_minimum +
-                                    priority_obj.leeway * (priority_obj.globalmax - priority_obj.converged_minimum);
-          if (priority_obj.samples[0] > max_leeway_value &&
-              priority_obj.samples[1] > max_leeway_value)
-          {
-            objectives_[current_objective_idx_].converged = true;
-            objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
-            double *minimization_point = nmd.getMinVertices();
-            objectives_[current_objective_idx_].minimization_params[0] =
-                minimization_point[0];
-            objectives_[current_objective_idx_].minimization_params[1] =
-                minimization_point[1];
+			if (nmd_res.converged)
+			{
+				double min_score = nmd.getMinObjective();
+				double *minimization_point = nmd.getMinVertices();
 
 #ifdef DEBUG_CONVERGENCE_
-            std::cout << "[LOCALOPTIMIZER|INFO] Leeway convergence\n";
-            std::cout << "******************************************" << std::endl;
-            std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << objectives_[current_objective_idx_].converged_minimum << "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] << std::endl;
-            std::cout << "******************************************" << std::endl;
-#endif
-            // find the parameter point that scores the leeway margin value
-            act.delta_threads = (int)priority_obj.minimization_params[0] *
-                                (max_leeway_value / priority_obj.converged_minimum);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-            act.frequency_idx = (int)priority_obj.minimization_params[1] *
-                                (max_leeway_value / priority_obj.converged_minimum);
-#endif
-            current_objective_idx_++;
-            if (current_objective_idx_ == objectives_.size())
-            {
-              converged_ = true;
-#ifdef DEBUG_CONVERGENCE_
-              std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
-#endif
-            }
-            act.delta_threads = (nmd_res.threads == 0) ? getCurrentThreads() : nmd_res.threads;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-            act.frequency_idx = nmd_res.freq_idx;
-#endif
+				std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n";
+				std::cout << "******************************************" << std::endl;
+				std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << min_score << " Threads = " << minimization_point[0] << " Freq_idx = " << minimization_point[1] << std::endl;
+				std::cout << "******************************************" << std::endl;
+#endif
+				act.delta_threads = minimization_point[0];
+				act.frequency_idx = minimization_point[1];
+				// VV: Stop searching for a new knob_set
+				explore_knob_domain = false;
+			} else {
+				// VV: Have not converged yet, keep exploring
+				act.delta_threads = nmd_res.threads;
+				act.frequency_idx = nmd_res.freq_idx;
+			}
+		}
+	}
+#endif // ALLSCALE_HAVE_CPUFREQ
 
-            goto validate_act;
-          }
-        }
-#else
-  act.delta_threads = nmd_res.threads;
-  act.frequency_idx = nmd_res.freq_idx;
-#endif
-    }
-  }
 validate_act:
 
-  if (act.delta_threads > max_threads_)
-  {
-    act.delta_threads = max_threads_;
-  }
-  else if (act.delta_threads < 1)
-  {
-    act.delta_threads = getCurrentThreads();
-  }
+	if (act.delta_threads > max_threads_)
+	{
+		act.delta_threads = max_threads_;
+	}
+	else if (act.delta_threads < 1)
+	{
+		act.delta_threads = getCurrentThreads();
+	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
-  // VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
-  if (act.frequency_idx < 0)
-    act.frequency_idx = frequency_param_;
-  else if (act.frequency_idx > frequencies_param_allowed_.size() - 1)
-  {
-    act.frequency_idx = frequencies_param_allowed_.size() - 1;
-  }
+	// VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
+	if (act.frequency_idx < 0)
+		act.frequency_idx = frequency_param_;
+	else if (act.frequency_idx > frequencies_param_allowed_.size() - 1)
+		act.frequency_idx = frequencies_param_allowed_.size() - 1;
 #endif
-  return act;
+	return act;
 }
 } // namespace components
 } // namespace allscale
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index b96664a..414382c 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -14,12 +14,13 @@
 #define NMD_DEBUG_ 1
 #define NMD_INFO_ 1
 
-/* create the initial simplex
-
-   vector<doubl
-
-*/
-
+#ifdef NMD_DEBUG_
+#define OUT_DEBUG(X) X
+#else
+#define OUT_DEBUG(X) \
+    {                \
+    }
+#endif
 namespace allscale
 {
 namespace components
@@ -78,20 +79,140 @@ void NelderMead::my_constraints(double x[])
     x[1] = round(x[1]);
 }
 
+bool NelderMead::cache_update(int threads, int freq_idx,
+                              const double objectives[], bool add_if_new)
+{
+    auto key = std::make_pair(threads, freq_idx);
+    auto past = cache_.find(key);
+
+    if (past != cache_.end())
+    {
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        double abs_diff = 0;
+        for (auto j = 0; j < NMD_NUM_OBJECTIVES; ++j)
+        {
+            abs_diff += past->second.objectives[j] - objectives[j];
+            past->second.objectives[j] = objectives[j];
+        }
+
+        past->second._cache_timestamp = timestamp_now;
+        // VV: Entries which remain relatively the same should be explored less frequently
+        if (abs_diff > 0.1)
+            past->second._cache_expires_dt = CACHE_EXPIRE_AFTER_MS;
+        else if (past->second._cache_expires_dt < CACHE_EXPIRE_AFTER_MS * 1024)
+            past->second._cache_expires_dt *= 2;
+
+        return true;
+    }
+    else if (add_if_new)
+    {
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        optstepresult entry;
+        entry._cache_timestamp = timestamp_now;
+        entry._cache_expires_dt = CACHE_EXPIRE_AFTER_MS;
+        entry.threads = threads;
+        entry.freq_idx = freq_idx;
+
+        for (auto j = 0; j < NMD_NUM_OBJECTIVES; ++j)
+            entry.objectives[j] = objectives[j];
+
+        cache_.insert(std::make_pair(key, entry));
+
+        return true;
+    }
+
+    return false;
+}
+
+double NelderMead::evaluate_score(const double objectives[], const double *weights) const
+{
+    double score = 0.0f;
+    // VV: [time, energy/power, resources]
+    double scale[] = {1.0, 1000.0, 1.0};
+    scale[2] = (double)constraint_max[0];
+
+    if (weights == nullptr)
+        weights = opt_weights;
+
+    for (auto i = 0; i < NMD_NUM_OBJECTIVES; ++i)
+    {
+        double t = objectives[i] / scale[i];
+        score += t * t * weights[i];
+    }
+
+    return score;
+}
+
+void NelderMead::set_weights(double weights[3])
+{
+    opt_weights[0] = weights[0];
+    opt_weights[1] = weights[1];
+    opt_weights[2] = weights[2];
+    OUT_DEBUG(
+        std::cout << "[NelderMead|DEBUG] Weights: " 
+                << opt_weights[0] << " "
+                << opt_weights[1] << " "
+                << opt_weights[2] << std::endl;
+    )
+}
+
 /* FIXME: generalize */
-void NelderMead::initialize_simplex(double params[][2], double values[], double constraint_min[], double constraint_max[])
+void NelderMead::initialize_simplex(double params[][2],
+                                    double objectives[][3],
+                                    double weights[3],
+                                    double constraint_min[2],
+                                    double constraint_max[2])
 {
     int i, j;
+    long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
-    for (i = 0; i <= n; i++)
+    for (i = 0; i < NMD_NUM_KNOBS; i++)
+    {
+        this->constraint_min[i] = constraint_min[i];
+        this->constraint_max[i] = constraint_max[i];
+    }
+
+    set_weights(weights);
+
+    // VV: The simplex needs NMD_NUM_KNOBS + 1 vertices
+    for (i = 0; i < NMD_NUM_KNOBS + 1; i++)
     {
+        f[i] = evaluate_score(objectives[i], weights);
+
         for (j = 0; j < n; j++)
         {
             v[i][j] = params[i][j];
         }
-        f[i] = values[i];
-        this->constraint_min[i] = constraint_min[i];
-        this->constraint_max[i] = constraint_max[i];
+
+        my_constraints(v[i]);
+
+        optstepresult entry;
+        entry.threads = round(v[i][0]);
+        entry.freq_idx = round(v[i][1]);
+
+        // VV: Check if we can re-use a previously explored configuration
+        auto key = std::make_pair(entry.threads, entry.freq_idx);
+
+        auto past_entry = cache_.find(std::make_pair(entry.threads,
+                                                     entry.freq_idx));
+        if (past_entry != cache_.end())
+        {
+            for (j = 0; j < NMD_NUM_OBJECTIVES; ++j)
+                past_entry->second.objectives[j] = objectives[i][j];
+
+            past_entry->second._cache_timestamp = timestamp_now;
+            // VV: Skip attempting to re-insert the "same" entry
+            continue;
+        }
+
+        // VV: If we've reached this point we need to add the entry to the cache
+        for (j = 0; j < NMD_NUM_OBJECTIVES; ++j)
+            entry.objectives[j] = objectives[i][j];
+
+        entry._cache_timestamp = timestamp_now;
+        entry._cache_expires_dt = CACHE_EXPIRE_AFTER_MS;
+
+        cache_.insert(std::make_pair(key, entry));
     }
     itr = 0;
 
@@ -103,13 +224,30 @@ void NelderMead::print_initial_simplex()
 {
     int i, j;
     std::cout << "[NelderMead DEBUG] Initial Values\n";
-    for (j = 0; j <= n; j++)
+    
+    for (j = 0; j < NMD_NUM_KNOBS + 1; j++)
     {
-        for (i = 0; i < n; i++)
+        
+        for (i = 0; i < NMD_NUM_KNOBS; i++)
         {
             std::cout << v[j][i] << ",";
         }
-        std::cout << " Objective value = " << f[j] << std::endl;
+        const int threads = (int) v[j][0];
+        const int freq_idx = (int) v[j][1];
+
+        auto e = cache_.find(std::make_pair(threads, freq_idx));
+        std::cout << " Objective value = " << f[j];
+
+        if ( e == cache_.end() )
+        {
+            std::cout << " (not in cache)" << std::endl;
+        } else {
+            std::cout << " OBJs: "
+                     << e->second.objectives[0] << " "
+                     << e->second.objectives[1] << " "
+                     << e->second.objectives[2] << " "
+                     << std::endl;
+        }
     }
 }
 
@@ -138,53 +276,6 @@ void NelderMead::print_iteration()
     std::cout << "[NelderMead DEBUG] f[vg]= " << f[vg] << ", vg = " << vg << std::endl;
 }
 
-/* find the index of the largest value */
-int NelderMead::vg_index()
-{
-    int j;
-    int vg = 0;
-
-    for (j = 0; j <= n; j++)
-    {
-        if (f[j] > f[vg])
-        {
-            vg = j;
-        }
-    }
-    return vg;
-}
-
-/* find the index of the smallest value */
-int NelderMead::vs_index()
-{
-    int j;
-    int vs = 0;
-
-    for (j = 0; j <= n; j++)
-    {
-        if (f[j] < f[vs])
-        {
-            vs = j;
-        }
-    }
-    return vs;
-}
-
-/* find the index of the second largest value */
-int NelderMead::vh_index()
-{
-    int j;
-
-    for (j = 0; j <= n; j++)
-    {
-        if (f[j] > f[vh] && f[j] < f[vg])
-        {
-            vh = j;
-        }
-    }
-    return vh;
-}
-
 /* calculate the centroid */
 void NelderMead::centroid()
 {
@@ -222,28 +313,82 @@ void NelderMead::sort_vertices()
 
     // VV: Find out what's the half-point by using a bitmap,
     //     when vg==vs that means that all points are equal
-    vh = 1 + 2 + 4 - (1 << vg) - (1 << vs);
-    vh = map_to_index[vh];
+    if (vg != vs)
+    {
+        vh = 1 + 2 + 4 - (1 << vg) - (1 << vs);
+        vh = map_to_index[vh];
+    }
+    else
+    {
+        vg = 2;
+        vh = 1;
+        vs = 0;
+    }
 }
 
-optstepresult NelderMead::do_step_start(double param)
+bool NelderMead::knob_set_exists(double knobs[2], int exclude)
+{
+    int is_same;
+
+    for (auto i=0; i<NMD_NUM_KNOBS+1; ++i) {
+        if ( i != exclude ) {
+            is_same = 1;
+            for ( auto j=0; j<NMD_NUM_KNOBS; ++j ) 
+                is_same &= (v[i][j] == knobs[j]);
+            
+            if ( is_same )
+                return true;
+        }
+    }
+
+    return false;
+}
+
+optstepresult NelderMead::do_step_start()
 {
     optstepresult res;
 
     itr++;
-#ifdef NMD_DEBUG_
-    std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
-    print_initial_simplex();
-#endif
+    OUT_DEBUG(
+        std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
+        print_initial_simplex();)
+
     sort_vertices();
 
     centroid();
+    double extra[2] = {0.0, 0.0};
+    int is_invalid = 0;
+    int max_combinations = 0;
+
+    max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
+
+
+    // VV: Try not to pick a knob_set that already exists in `v`
+    do {
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
+            vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]) + extra[j];
+        
+        my_constraints(vr);
+        
+        is_invalid = 0;
+
+        if ( max_combinations > NMD_NUM_KNOBS +1 ) {
+            is_invalid = knob_set_exists(vr, -1);
+
+            if ( is_invalid ) {
+                extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
+                            + (int) constraint_min[0]
+                            - (int)(0.5*(constraint_max[0] - constraint_min[0]));
+
+                extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
+                            + (int) constraint_min[1]
+                            - (int)(0.5*(constraint_max[1] - constraint_min[1]));
+                
+            }
+        } 
+        
+    } while ( is_invalid );
 
-    for (j = 0; j <= n - 1; j++)
-    {
-        vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
-    }
-    my_constraints(vr);
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] Reflection Parameter = ("
               << vr[0] << "," << vr[1] << ")"
@@ -254,21 +399,34 @@ optstepresult NelderMead::do_step_start(double param)
     res.threads = vr[0];
     res.freq_idx = vr[1];
 
+    auto key = std::make_pair(res.threads, res.freq_idx);
+
+    auto entry = cache_.find(key);
+
+    if (entry != cache_.end())
+    {
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        auto dt = timestamp_now - entry->second._cache_timestamp;
+
+        if (dt < entry->second._cache_expires_dt)
+        {
+            return do_step_reflect(entry->second.objectives);
+        }
+    }
+
     return res;
 }
 
-optstepresult NelderMead::do_step_reflect(double param)
+optstepresult NelderMead::do_step_reflect(const double objectives[])
 {
     optstepresult res;
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
 #endif
-    fr = param;
+    fr = evaluate_score(objectives, opt_weights);
 
-    std::cout << "fr:" << fr << " f[vh]:" << f[vh]
-              << " f[vs]:" << f[vs] << std::endl;
-
-    if ( (f[vs] <= fr) && (fr < f[vh]) ) {
+    if ((f[vs] <= fr) && (fr < f[vh]))
+    {
         // VV: REFLECTED point is better than the SECOND BEST
         //     but NOT better than the BEST
         //     Replace WORST point with REFLECTED
@@ -276,30 +434,112 @@ optstepresult NelderMead::do_step_reflect(double param)
         {
             v[vg][j] = vr[j];
         }
+
+        my_constraints(v[vg]);
+
         f[vg] = fr;
+
+        const int threads = (int)(v[vg][0]);
+        const int freq_idx = (int)(v[vg][1]);
+
+        cache_update(threads, freq_idx, objectives, true);
+
         state_ = start;
-        return do_step_start(param);
-    } else if ( fr < f[vs] ) {
+        return do_step_start();
+    }
+    else if (fr < f[vs])
+    {
         // VV: REFLECTED is better than BEST
-        
-        for ( j=0; j<=n-1; ++j)
-            ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
-        
-        my_constraints(ve);
+
+        double extra[2] = {0.0, 0.0};
+        int is_invalid = 0;
+        int max_combinations = 0;
+
+        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
+
+        // VV: Try not to pick a knob_set that already exists in `v`
+        do {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]) + extra[j];
+            
+            my_constraints(ve);
+            
+            is_invalid = 0;
+
+            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
+                is_invalid = knob_set_exists(ve, -1);
+
+                if ( is_invalid ) {
+                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
+                                + (int) constraint_min[0]
+                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
+
+                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
+                                + (int) constraint_min[1]
+                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
+                    
+                }
+            } 
+            
+        } while ( is_invalid );
+
         // VV: Now evaluate EXPANDED
         res.threads = ve[0];
         res.freq_idx = ve[1];
 
         state_ = expansion;
 
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_expand(entry->second.objectives);
+            }
+        }
+
         return res;
-    } else if ( (f[vh] <= fr) && (fr < f[vg])) {
+    }
+    else if ((f[vh] <= fr) && (fr < f[vg]))
+    {
         // VV: REFLECTED between SECOND BEST and WORST
-        
-        for ( j=0; j<=n-1; ++j)
-            vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
-        
-        my_constraints(vc);
+        double extra[2] = {0.0, 0.0};
+        int is_invalid = 0;
+        int max_combinations = 0;
+
+        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
+
+        // VV: Try not to pick a knob_set that already exists in `v`
+        do {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                vc[j] = vm[j] + BETA * (vr[j] - vm[j]) + extra[j];
+            
+            my_constraints(vc);
+            
+            is_invalid = 0;
+
+            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
+                is_invalid = knob_set_exists(vc, -1);
+
+                if ( is_invalid ) {
+                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
+                                + (int) constraint_min[0]
+                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
+
+                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
+                                + (int) constraint_min[1]
+                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
+                    
+                }
+            } 
+            
+        } while ( is_invalid );
 
         // VV: Now evaluate EXPANDED
         res.threads = vc[0];
@@ -307,30 +547,88 @@ optstepresult NelderMead::do_step_reflect(double param)
 
         state_ = contraction;
 
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_contract(entry->second.objectives);
+            }
+        }
+
         return res;
-    } else {
+    }
+    else
+    {
         // VV: REFLECTED worse than WORST
-        for ( j=0; j<=n-1; ++j)
-            vc[j] = vm[j] - BETA * (vr[j] - vm[j]);
-        
-        my_constraints(vc);
+        double extra[2] = {0.0, 0.0};
+        int is_invalid = 0;
+        int max_combinations = 0;
+
+        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
+
+        // VV: Try not to pick a knob_set that already exists in `v`
+        do {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                vc[j] = vm[j] - BETA * (vr[j] - vm[j]) + extra[j];
+            
+            my_constraints(vc);
+            
+            is_invalid = 0;
+
+            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
+                is_invalid = knob_set_exists(vc, -1);
+
+                if ( is_invalid ) {
+                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
+                                + (int) constraint_min[0]
+                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
+
+                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
+                                + (int) constraint_min[1]
+                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
+                    
+                }
+            } 
+            
+        } while ( is_invalid );
 
         // VV: Now evaluate EXPANDED
         res.threads = vc[0];
         res.freq_idx = vc[1];
 
         state_ = contraction;
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_contract(entry->second.objectives);
+            }
+        }
 
         return res;
     }
 }
 
-optstepresult NelderMead::do_step_expand(double param)
+optstepresult NelderMead::do_step_expand(const double objectives[])
 {
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl;
 #endif
-    fe = param;
+    fe = evaluate_score(objectives, nullptr);
 
     if (fe < fr)
     {
@@ -353,19 +651,23 @@ optstepresult NelderMead::do_step_expand(double param)
     }
 
     state_ = start;
-    
-    return do_step_start(param);
+    const int threads = (int)(v[vg][0]);
+    const int freq_idx = (int)(v[vg][1]);
+
+    cache_update(threads, freq_idx, objectives, true);
+    return do_step_start();
 }
 
-optstepresult NelderMead::do_step_contract(double param)
+optstepresult NelderMead::do_step_contract(const double objectives[])
 {
     int j;
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead|DEBUG] State = Contraction" << std::endl;
 #endif
-    fc = param;
+    fc = evaluate_score(objectives, nullptr);
 
-    if ( fc <= fr ) {
+    if (fc <= fr)
+    {
         // VV: CONTRACTED_O is better than REFLECTED
         //     Replace WORST with CONTRACTED_O
         for (j = 0; j <= n - 1; j++)
@@ -374,58 +676,122 @@ optstepresult NelderMead::do_step_contract(double param)
         }
         f[vg] = fc;
 
-        return do_step_start(param);
-    } else {
+        const int threads = (int)(v[vg][0]);
+        const int freq_idx = (int)(v[vg][1]);
+
+        cache_update(threads, freq_idx, objectives, true);
+        return do_step_start();
+    }
+    else
+    {
         // VV: Replace SECOND BEST
-        for (j = 0; j <= n - 1; j++)
-            v[vh][j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]);
-        
-        my_constraints(v[vh]);
+        double new_vh[NMD_NUM_KNOBS];
+        double extra[NMD_NUM_KNOBS] = {0.0, 0.0};
+        int is_invalid = 0;
+        int max_combinations = 0;
+
+        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
+
+        // VV: Try not to pick a knob_set that already exists in `v`
+        do {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                new_vh[j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]) + extra[j];
+            
+            my_constraints(new_vh);
+            
+            is_invalid = 0;
+
+            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
+                is_invalid = knob_set_exists(new_vh, -1);
+
+                if ( is_invalid ) {
+                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
+                                + (int) constraint_min[0]
+                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
+
+                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
+                                + (int) constraint_min[1]
+                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
+                    
+                }
+            } 
+            
+        } while ( is_invalid );
+
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
+            v[vh][j] = new_vh[j];
+
         // VV: Now evaluate SHRINK
 
         optstepresult res;
         res.threads = v[vh][0];
         res.freq_idx = v[vh][1];
         state_ = shrink;
+
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_shrink(entry->second.objectives);
+            }
+        }
+
         return res;
     }
 }
 
-optstepresult NelderMead::do_step_shrink(double param)
+optstepresult NelderMead::do_step_shrink(const double objectives[])
 {
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl;
 #endif
-    f[vh] = param;
-    return do_step_start(param);
+    f[vh] = evaluate_score(objectives, nullptr);
+
+    const int threads = (int)(v[vh][0]);
+    const int freq_idx = (int)(v[vh][1]);
+
+    cache_update(threads, freq_idx, objectives, true);
+
+    return do_step_start();
 }
 
-optstepresult NelderMead::step(double param)
+optstepresult NelderMead::step(const double objectives[])
 {
     int i, j;
 
     optstepresult res;
     res.threads = 0;
     res.freq_idx = -1;
-
+    std::cout << "Starting step with "
+                << objectives[0] << " " 
+                << objectives[1] << " " 
+                << objectives[2] << std::endl;
+    
     switch (state_)
     {
 
     case start:
-        res = do_step_start(param);
-    break;
+        res = do_step_start();
+        break;
     case reflection:
-        res = do_step_reflect(param);
-    break;
+        res = do_step_reflect(objectives);
+        break;
     case expansion:
-        res = do_step_expand(param);
-    break;
+        res = do_step_expand(objectives);
+        break;
     case contraction:
-        res = do_step_contract(param);
-    break;
+        res = do_step_contract(objectives);
+        break;
     case shrink:
-        res = do_step_shrink(param);
-    break;
+        res = do_step_shrink(objectives);
+        break;
     default:
         std::cout << "Unknown NelderMead state (" << state_ << ")" << std::endl;
         res.converged = false;
@@ -434,11 +800,16 @@ optstepresult NelderMead::step(double param)
 
     res.converged = testConvergence();
 
-    if ( res.converged == true ) {
+    if (res.converged == true)
+    {
         res.threads = v[vs][0];
         res.freq_idx = v[vs][1];
         std::cout << "Converged to " << res.threads << " " << res.freq_idx << std::endl;
     }
+    std::cout << "Stop step with "
+                << objectives[0] << " " 
+                << objectives[1] << " " 
+                << objectives[2] << std::endl;
 
     return res;
 }
@@ -446,15 +817,31 @@ optstepresult NelderMead::step(double param)
 bool NelderMead::testConvergence()
 {
     double temp;
+    #if 0
+    int all_same = 1;
+
+    for (auto i = 0; i <= n; ++i)
+    {
+        for (auto k = i + 1; j <= n; ++k)
+            for (auto j = 0; j < n; ++j)
+                all_same &= (v[i][j] == v[k][j]);
+    }
+
+    if (all_same)
+    {
+        min = f[vs];
+        return true;
+    }
+    #endif
 
     fsum = 0.0;
-    for (j = 0; j <= n; j++)
+    for (auto j = 0; j <= n; j++)
     {
         fsum += f[j];
     }
     favg = fsum / (n + 1);
     s = 0.0;
-    for (j = 0; j <= n; j++)
+    for (auto j = 0; j <= n; j++)
     {
         temp = (f[j] - favg);
         s += temp * temp / (n);
@@ -469,56 +856,11 @@ bool NelderMead::testConvergence()
         return false;
     else
     {
-        vs = vs_index();
+        sort_vertices();
         min = f[vs];
         return true;
     }
 }
 
-void NelderMead::updateObjectives()
-{
-    /* re-evaluate all the vertices */
-    /*for (j=0;j<=n;j++) {
-                  f[j] = objfunc(v[j]);
-                  }
-                  */
-
-    /* find the index of the largest value */
-    vg = vg_index();
-
-    /* find the index of the smallest value */
-    vs = vs_index();
-
-    /* find the index of the second largest value */
-    vh = vh_index();
-
-    my_constraints(v[vg]);
-
-    //f[vg] = objfunc(v[vg]);
-
-    my_constraints(v[vh]);
-
-    //f[vh] = objfunc(v[vh]);
-}
-
 } // namespace components
 } // namespace allscale
-/*
-
-       std::vector<double> NelderMead::minimum(){
-
-
-       free(f);
-       free(vr);
-       free(ve);
-       free(vc);
-       free(vm);
-       for (i=0;i<=n;i++) {
-       free (v[i]);
-       }
-       free(v);
-       return min;
-
-
-       }
-       */
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index e2b7df9..3b4aaba 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -223,8 +223,6 @@ void scheduler::init() {
         )
     );
 
-//   std::cout << "init: " << num_cores << " " << allscale::get_num_localities() << " " << depth_cut_off_ << '\n';
-
   // Reading user provided options in terms of desired optimization objectives
   std::string input_objective_str =
       hpx::get_config_entry("allscale.objective", "");
@@ -232,18 +230,24 @@ void scheduler::init() {
   /* Read optimization policy selected by the user. If not specified,
      allscale policy is the default */
     std::string input_optpolicy_str =
-      hpx::get_config_entry("allscale.policy", "allscale");
+      hpx::get_config_entry("allscale.policy", "none");
 #ifdef DEBUG_MULTIOBJECTIVE_
     std::cout << "[Local Optimizer|INFO] Optimization Policy Active = " << input_optpolicy_str << std::endl;
 #endif
-    if (input_optpolicy_str=="allscale")
-      lopt_.setPolicy(allscale);
-    else if (input_optpolicy_str=="random")
+#if ALLSCALE_HAVE_CPUFREQ
+    if (input_optpolicy_str=="allscale") {
+		lopt_.setPolicy(allscale);
+	}
+    else 
+#endif
+	if (input_optpolicy_str=="random")
       lopt_.setPolicy(random);
     else if (input_optpolicy_str=="manual")
       lopt_.setPolicy(manual);
-    else lopt_.setPolicy(allscale);
-
+	else if ( input_optpolicy_str != "none" ) {
+		HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", 
+							"unknown allscale.policy");
+	}
 #ifdef MEASURE_MANUAL_
   std::string input_osthreads_str =
       hpx::get_config_entry("allscale.osthreads", "");

From 67ee0c16d1af5bd01fd27b94deb5fbbb76276c59 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Wed, 14 Nov 2018 14:21:41 +0000
Subject: [PATCH 06/37] Moved warmup stage in NMD algorithm

---
 allscale/components/localoptimizer.hpp   |  17 +-
 allscale/components/nmsimplex_bbincr.hpp | 245 +++++------
 allscale/components/scheduler.hpp        |   4 +-
 src/components/localoptimizer.cpp        |  93 ++---
 src/components/nmsimplex_bbincr.cpp      | 495 +++++++++++++----------
 src/components/scheduler_component.cpp   |   2 +-
 6 files changed, 456 insertions(+), 400 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index d9799cc..4b2d1ce 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -83,7 +83,7 @@ namespace allscale { namespace components {
     struct localoptimizer
     {
         localoptimizer()
-            :nmd(0.01),
+            :nmd(convergence_threshold_),
             pending_threads(0.),
             pending_energy(0.),
             pending_time(0.),
@@ -142,10 +142,7 @@ namespace allscale { namespace components {
             return max_threads_;
         }
 
-        void setmaxthreads(std::size_t threads){
-            max_threads_=threads;
-            threads_param_=threads;
-        }
+        void setmaxthreads(std::size_t threads);
 
         /* executes one step of multi-objective optimization */
         actuation step();
@@ -178,6 +175,9 @@ namespace allscale { namespace components {
         }
 
     private:
+        // VV: Used to convert thread_idx to actual number of threads
+        std::size_t threads_dt;
+
         void accumulate_objective_measurements();
         void reset_accumulated_measurements();
 
@@ -188,9 +188,6 @@ namespace allscale { namespace components {
 
         bool explore_knob_domain;
         
-        double initialization_samples[NMD_NUM_KNOBS+1][NMD_NUM_OBJECTIVES];
-        double initialization_params[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
-
         double pending_time, pending_energy, pending_threads;
         unsigned long pending_num_times;
 
@@ -234,8 +231,8 @@ namespace allscale { namespace components {
 #endif
 
         /* threshold (percentage in [0,1]) to decide convergence of optimization
-           steps against a single objective */
-        const double convergence_threshold_ = 0.02;
+           steps */
+        const double convergence_threshold_ = 0.01;
 
         /***** optimization state variables ******/
 
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index e87fe8c..2674517 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -38,29 +38,38 @@ namespace components
 // VV: time, energy/power, resources
 #define NMD_NUM_OBJECTIVES 3
 
+
+#if (NMD_NUM_OBJECTIVES != 3)
+#error UNSUPPORTED number of Objectives
+#endif
+
+#if (NMD_NUM_KNOBS != 2)
+#error UNSUPPORTED number of Knobs
+#endif
+
 #define MAX_IT 1000 /* maximum number of iterations */
 #define ALPHA 1.0   /* reflection coefficient */
-#define BETA 0.5    /* contraction coefficient */
+#define BETA 0.5	/* contraction coefficient */
 #define GAMMA 2.0   /* expansion coefficient */
 #define DELTA 0.5   /* shrinking coefficient */
 
-#define CACHE_EXPIRE_AFTER_MS 5000
+#define CACHE_EXPIRE_AFTER_MS 35000
 
 /* structure type of a single optimization step return status */
 struct optstepresult
 {
-  /* true if optimization has converged for the specified objective */
-  bool converged;
-  /* number of threads for parameters to set for sampling */
-  double threads;
-  /* index to frequency vector for freq parameter to set for sampling*/
-  int freq_idx;
-
-  /******VV: Cache stuff******/
-  double score;
-  double objectives[3]; // (time, energy, resource)
-  // VV: _cache_expires denotes dt (in ms) after _cache_timestamp
-  int64_t _cache_timestamp, _cache_expires_dt;
+	/* true if optimization has converged for the specified objective */
+	bool converged;
+	/* number of threads for parameters to set for sampling */
+	double threads;
+	/* index to frequency vector for freq parameter to set for sampling*/
+	int freq_idx;
+
+	/******VV: Cache stuff******/
+	double score;
+	double objectives[3]; // (time, energy, resource)
+	// VV: _cache_expires denotes dt (in ms) after _cache_timestamp
+	int64_t _cache_timestamp, _cache_expires_dt;
 };
 
 typedef std::map<std::pair<int, int>, optstepresult> MapCache_t;
@@ -68,143 +77,153 @@ typedef std::map<std::pair<int, int>, optstepresult> MapCache_t;
 /* enumeration encoding state that the incremental Nelder Mead optimizer is at */
 enum iterationstates
 {
-  start,
-  reflection,
-  expansion,
-  contraction,
-  shrink
+	// VV: Need NMD_NUM_KNOBS + 1 values before we can start optimizing
+	warmup,
+	start,
+	reflection,
+	expansion,
+	contraction,
+	shrink
 };
 
+
 class NelderMead
 {
 
-public:
-  NelderMead(double);
-  // VV: For the time being params = [threads, freq_idx]
-  //     objectives = [time, energy/power, resources]
-  //     weights = [ W_time, W_energy/power, W_resources ]
-  //     constraint_min = [min_threads, min_freq_idx]
-  void initialize_simplex(double params[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS],
-                          double objectives[][NMD_NUM_OBJECTIVES],
-                          double weights[NMD_NUM_OBJECTIVES],
-                          double constraint_min[NMD_NUM_KNOBS],
-                          double constraint_max[NMD_NUM_KNOBS]);
-  void print_initial_simplex();
-  void print_iteration();
-  
-  double *getMinVertices()
-  {
-    return v[vs];
-  }
+  public:
+	NelderMead(double);
+	// VV: For the time being 
+	//     weights = [ W_time, W_energy/power, W_resources ]
+	//     constraint_min = [min_threads, min_freq_idx]
+	void initialize_simplex(double weights[NMD_NUM_OBJECTIVES],
+							double constraint_min[NMD_NUM_KNOBS],
+							double constraint_max[NMD_NUM_KNOBS]);
+
+	void print_initial_simplex();
+	void print_iteration();
+
+	double *getMinVertices()
+	{
+		return v[vs];
+	}
+
+	double getMinObjective()
+	{
+		return min;
+	}
+
+	unsigned long int getIterations() { return itr; }
+	double evaluate_score(const double objectives[], const double *weights) const;
+	void set_weights(double weights[]);
+
+	optstepresult step(const double objectives[]);
+
+  private:
+	int warming_up_step;
 
-  double getMinObjective()
-  {
-    return min;
-  }
+	// VV: Utility to make sure that we generate new values and not something that already
+	//     exists in the set of NMD_NUM_KNOBS+1 configuration points
+	template <typename F>
+	void generate_new(F &gen);
+	enum direction {up, up_final, down, left, right, right_final};
+	std::pair<int, direction> explore_next_extra(double *extra, int level, 
+                        direction dir, int level_max, int level_nested_max);
 
-  unsigned long int getIterations() { return itr; }
-  double evaluate_score(const double objectives[], const double *weights) const;
-  void set_weights(double weights[]);
+	//VV: objective_type: { <threads, cpu-freq>: optstepresult }
+	MapCache_t cache_;
 
-  optstepresult step(const double objectives[]);
-private:
-  //VV: objective_type: { <threads, cpu-freq>: optstepresult }
-  MapCache_t cache_;
-  
-  optstepresult do_step_start();
-  optstepresult do_step_reflect(const double objectives[]);
-  optstepresult do_step_expand(const double objectives[]);
-  optstepresult do_step_contract(const double objectives[]);
-  optstepresult do_step_shrink(const double objectives[]);
+	optstepresult do_step_start();
+	optstepresult do_step_reflect(const double objectives[]);
+	optstepresult do_step_expand(const double objectives[]);
+	optstepresult do_step_contract(const double objectives[]);
+	optstepresult do_step_shrink(const double objectives[]);
 
-  bool knob_set_exists(double knobs[2], int exclude);
+	void sort_vertices(void);
+	void my_constraints(double *);
+	void centroid();
+	bool testConvergence(std::size_t tested_combinations);
 
-  void sort_vertices(void);
-  void my_constraints(double *);
-  void centroid();
-  bool testConvergence();
+	// VV: Will return false if entry not in cache
+	bool cache_update(int threads, int freq_idx,
+					  const double objectives[],
+					  bool add_if_new);
 
-  // VV: Will return false if entry not in cache
-  bool cache_update(int threads, int freq_idx, 
-                    const double objectives[],
-                    bool add_if_new);
+	double round2(double num, int precision)
+	{
+		double rnum = 0.0;
+		int tnum;
 
-  double round2(double num, int precision)
-  {
-    double rnum = 0.0;
-    int tnum;
+		if (num == 0.0)
+			return num;
 
-    if (num == 0.0)
-      return num;
+		rnum = num * pow(10, precision);
+		tnum = (int)(rnum < 0 ? rnum - 0.5 : rnum + 0.5);
+		rnum = tnum / pow(10, precision);
 
-    rnum = num * pow(10, precision);
-    tnum = (int)(rnum < 0 ? rnum - 0.5 : rnum + 0.5);
-    rnum = tnum / pow(10, precision);
+		return rnum;
+	}
 
-    return rnum;
-  }
+	/* vertex with smallest value */
+	int vs;
 
-  /* vertex with smallest value */
-  int vs;
+	/* vertex with next smallest value */
+	int vh;
 
-  /* vertex with next smallest value */
-  int vh;
+	/* vertex with largest value */
+	int vg;
 
-  /* vertex with largest value */
-  int vg;
+	int i, j, row;
 
-  int i, j, row;
-  
-  const int n = 2;
+	const int n = 2;
 
-  /* track the number of function evaluations */
-  int k;
+	/* track the number of function evaluations */
+	int k;
 
-  /* track the number of iterations */
-  int itr;
+	/* track the number of iterations */
+	int itr;
 
-  /* holds vertices of simplex */
-  double **v;
+	/* holds vertices of simplex */
+	double **v;
 
-  /* value of function at each vertex */
-  double *f;
+	/* value of function at each vertex */
+	double *f;
 
-  /* value of function at reflection point */
-  double fr;
+	/* value of function at reflection point */
+	double fr;
 
-  /* value of function at expansion point */
-  double fe;
+	/* value of function at expansion point */
+	double fe;
 
-  /* value of function at contraction point */
-  double fc;
+	/* value of function at contraction point */
+	double fc;
 
-  /* reflection - coordinates */
-  double *vr;
+	/* reflection - coordinates */
+	double *vr;
 
-  /* expansion - coordinates */
-  double *ve;
+	/* expansion - coordinates */
+	double *ve;
 
-  /* contraction - coordinates */
-  double *vc;
+	/* contraction - coordinates */
+	double *vc;
 
-  /* centroid - coordinates */
-  double *vm;
+	/* centroid - coordinates */
+	double *vm;
 
-  double min;
+	double min;
 
-  double fsum, favg, s;
+	double fsum, favg, s;
 
-  double EPSILON;
+	double EPSILON;
 
-  iterationstates state_;
+	iterationstates state_;
 
-  const int MAXITERATIONS = 15;
+	const int MAXITERATIONS = 15;
 
-  double constraint_min[2];
+	double constraint_min[2];
 
-  double constraint_max[2];
+	double constraint_max[2];
 
-  double opt_weights[NMD_NUM_OBJECTIVES];
+	double opt_weights[NMD_NUM_OBJECTIVES];
 };
 
 } // namespace components
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index c508900..0980207 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -110,7 +110,7 @@ namespace allscale { namespace components {
         long last_optimization_timestamp_;
 
         /* periodicity in milliseconds to invoke the optimizer */
-        const long optimization_period_ms = 5000;
+        const long optimization_period_ms = 1000;
 
         /* captures absolute timestamp of the last time optimization
            objective value have been measured (sampled) */
@@ -119,7 +119,7 @@ namespace allscale { namespace components {
         long last_objective_measurement_timestamp_;
 
         /* periodicity in milliseconds to invoke objective sampling */
-        const long objective_measurement_period_ms = 1000;
+        const long objective_measurement_period_ms = 500;
 
         //extra masks to better handle suspending/resuming threads
         std::vector<hpx::threads::thread_pool_base*> thread_pools_;
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 04ef472..14158fa 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -24,6 +24,7 @@ namespace allscale
 {
 namespace components
 {
+	#if 0
 localoptimizer::localoptimizer(std::list<objective> targetobjectives)
 	: objectives_((int)targetobjectives.size()),
 	  nmd(convergence_threshold_),
@@ -49,6 +50,7 @@ localoptimizer::localoptimizer(std::list<objective> targetobjectives)
 	setCurrentFrequencyIdx(0);
 #endif
 };
+#endif
 
 void localoptimizer::setobjectives(std::list<objective> targetobjectives)
 {
@@ -153,42 +155,47 @@ void localoptimizer::accumulate_objective_measurements()
 	if (pending_num_times)
 	{
 		pending_time /= (double)pending_num_times;
-		pending_threads /= (double)pending_num_times;
+		pending_threads /= (double)(pending_num_times*threads_dt);
 		pending_energy /= (double)pending_num_times;
 		pending_num_times = 0;
 	}
 }
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-void localoptimizer::initialize_nmd()
+void localoptimizer::setmaxthreads(std::size_t threads)
 {
-	// VV: Retrieve measurements for last exploration
-	if ( steps_ == warmup_steps_ +1 )
-	{
-		accumulate_objective_measurements();
-
-		initialization_samples[steps_ - 2][0] = pending_time;
-		initialization_samples[steps_ - 2][1] = pending_energy;
-		initialization_samples[steps_ - 2][2] = pending_threads;
-
-		reset_accumulated_measurements();
+	max_threads_=threads;
+	threads_param_=threads;
+	#if 0
+	double threads_tick = threads / 5.;
 
-		initialization_params[steps_ - 2][1] = getCurrentFrequencyIdx();
-	}
+	if ( threads_tick < 1.0 )
+		threads_tick = 1.0;
 	
-	// VV: Place reasonable limits to #threads and cpu_freq tunable knobs
-	double min_threads = round(max_threads_ * 0.25);
+	threads_dt = (int) round(threads_tick);
+	#elif 0
+	if ( max_threads_ <= 4 )
+		threads_dt = 1.;
+	else if ( max_threads_ <= 8 )
+		threads_dt = 2.;
+	else if ( max_threads_ <= 32 )
+		threads_dt = 4.;
+	else
+		threads_dt = 8.;
+	#else 
+		threads_dt = 1.;
+	#endif
+}
 
-	if (min_threads < 1.0)
-		min_threads = 1.0;
+#ifdef ALLSCALE_HAVE_CPUFREQ
+void localoptimizer::initialize_nmd()
+{
+	// VV: Place reasonable limits to #threads and cpu_freq tunable knobs
 
-	double constraint_min[] = {min_threads, 0};
-	double constraint_max[] = {(double)max_threads_,
+	double constraint_min[] = {1, 0};
+	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
 							   (double)frequencies_param_allowed_.size() - 1};
 
-	nmd.initialize_simplex(initialization_params, 
-						   initialization_samples,
-						   opt_weights,
+	nmd.initialize_simplex(opt_weights,
 						   constraint_min, constraint_max);
 
 	mo_initialized = true;
@@ -242,38 +249,6 @@ actuation localoptimizer::step()
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	else if (optmethod_ == allscale)
 	{
-		if (steps_ <= warmup_steps_)
-		{
-#ifdef DEBUG_MULTIOBJECTIVE_
-			std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n";
-#endif
-			// set some random parametrization to collect at least 3 different
-			// vertices to be used as input to the optimizer
-
-			//VV: TODO Ensure that we don't pick the same 3 configurations
-			float bucket_dt = steps_ / (float)warmup_steps_;
-			float _min_threads = max_threads_ * bucket_dt;
-
-			act.delta_threads = rand() % (int)ceil(bucket_dt) + roundf(_min_threads);
-
-			float _min_freqs = frequencies_param_allowed_.size() * bucket_dt;
-			act.frequency_idx = rand() % (int)ceil(bucket_dt) + roundf(_min_freqs);
-
-			if (steps_ > 1)
-			{
-				accumulate_objective_measurements();
-				initialization_samples[steps_ - 2][0] = pending_time;
-				initialization_samples[steps_ - 2][1] = pending_energy;
-				initialization_samples[steps_ - 2][2] = pending_threads;
-				reset_accumulated_measurements();
-				initialization_params[steps_ - 2][0] = getCurrentThreads();
-
-			initialization_params[steps_ - 2][1] = getCurrentFrequencyIdx();
-
-			}
-			goto validate_act;
-		}
-
 		if (mo_initialized == false)
 			initialize_nmd();
 				
@@ -285,6 +260,7 @@ actuation localoptimizer::step()
 
 		if ( explore_knob_domain ){
 			optstepresult nmd_res = nmd.step(latest_measurements);
+
 #ifdef DEBUG_MULTIOBJECTIVE_
 			std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try:";
 			std::cout << " Threads = " << nmd_res.threads;
@@ -311,6 +287,11 @@ actuation localoptimizer::step()
 				act.delta_threads = nmd_res.threads;
 				act.frequency_idx = nmd_res.freq_idx;
 			}
+			
+			act.delta_threads *= threads_dt;
+			std::cout << "[LOCALOPTIMIZER|DEBUG] ACTUAL Vertex to try:";
+			std::cout << " Threads = " << act.delta_threads;
+			std::cout << " Freq Idx = " << act.frequency_idx << std::endl;
 		}
 	}
 #endif // ALLSCALE_HAVE_CPUFREQ
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 414382c..c878199 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -35,7 +35,7 @@ NelderMead::NelderMead(double eps)
     std::cout << "[NelderMead|INFO] Initial Convergence Threshold set is " << EPSILON << std::endl;
 #endif
     itr = 0;
-    state_ = start;
+    state_ = warmup;
 
     /* dynamically allocate arrays */
 
@@ -46,6 +46,8 @@ NelderMead::NelderMead(double eps)
     ve = (double *)malloc(n * sizeof(double));
     vc = (double *)malloc(n * sizeof(double));
     vm = (double *)malloc(n * sizeof(double));
+    
+    warming_up_step = 0;
 
     /* allocate the columns of the arrays */
     for (i = 0; i <= n; i++)
@@ -54,6 +56,143 @@ NelderMead::NelderMead(double eps)
     }
 }
 
+std::pair<int, NelderMead::direction> NelderMead::explore_next_extra(double *extra, int level, 
+                                direction dir, 
+                                int level_max, int level_nested_max)
+{
+    /*
+    const char *to_string[] = {
+        "up", "up_final", "down", "left", "right", "right_final"
+    };
+    */
+    if ( extra[0] == 0.0 && extra[1] == 0.0 ) {
+        extra[1] = 1.0;
+
+        return std::make_pair(level, dir);
+    }
+    switch (dir) {
+        case (direction::up):
+            if ( extra[1] < level ) {
+                extra[1] += 1.;
+            } else if( extra[0] < level_nested_max ) {
+                extra[0] += 1.;
+                dir = direction::right;
+            } else {
+                level ++;
+            }
+        break;
+
+        case (direction::up_final):
+            if ( extra[1] < level ) {
+                extra[1] += 1.;
+            } else if( extra[0] < level_nested_max ) {
+                extra[0] += 1.;
+                dir = direction::right_final;
+            } else {
+                level ++;
+            }
+        break;
+
+
+        case (direction::down):
+            if ( extra[1] > -level ) {
+                extra[1] -= 1.0;
+            } else if ( extra[0] > -level_nested_max ){
+                extra[0] -= 1.0;
+                dir = direction::left;
+            }
+        break;
+
+        case (direction::left):
+            if ( extra[0] > -level_nested_max ) {
+                extra[0] -= 1.0;
+            } else if (extra[1] < level ) {
+                extra[1] += 1.0;
+                dir = direction::up_final;
+            }
+        break;
+
+        case (direction::right):
+            if ( extra[0] < level_nested_max ) {
+                extra[0] += 1.;
+            } else if ( extra[1] <= level ) {
+                extra[1] -= 1.;
+                dir = direction::down;
+            }
+        break;
+        
+        case (direction::right_final):
+        if ( extra[0] < 0. ) {
+            extra[0] += 1.;
+        } else {
+            level ++; 
+            extra[0] = 0.0;
+            extra[1] = level;
+            dir = direction::right;
+        }
+        break;
+    }
+
+    return std::make_pair(level, dir);
+}
+
+template <typename F>
+void NelderMead::generate_new(F &gen)
+{
+    double extra[] = {0, 0};
+    double *new_set;
+    int i = 0;
+    int max_combinations = (constraint_max[0] - constraint_min[0]+1) 
+                            * (constraint_max[1] - constraint_min[1]+1);
+    int level = 1;
+    int max_nested_level = constraint_max[1] - constraint_min[1] +1;
+    int max_level = constraint_max[0] - constraint_min[0] +1;
+    direction dir = direction::right;
+
+    // VV: Search for a twice as big space to take into account that
+    //     new_set is not *actually* at 0, 0
+
+    max_level *= 2;
+    max_nested_level *=2;
+
+    int is_same;
+    do
+    {
+        new_set = gen(extra);
+        
+        auto key = std::make_pair((int)new_set[0], (int)new_set[1]);
+        auto entry = cache_.find(key);
+        is_same = (entry != cache_.end());
+
+        if ( ( level < max_level +1) 
+             && is_same 
+             && max_combinations > (NMD_NUM_KNOBS + 1))
+        {
+            # if 0
+            extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0]) 
+                            + (int)constraint_min[0] 
+                            - (int)(0.5 * (constraint_max[0] - constraint_min[0]));
+
+            extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1]) 
+                                + (int)constraint_min[1] 
+                                - (int)(0.5 * (constraint_max[1] - constraint_min[1]));
+            #else
+            auto logistics = explore_next_extra(extra, level, dir, 
+                                                max_level, max_nested_level);
+            level = logistics.first;
+            dir = logistics.second;
+
+            #endif
+            OUT_DEBUG(
+                std::cout << "[NelderMead|Debug] Rejecting " 
+                    << new_set[0] << " " << new_set[1] <<  std::endl;
+            )
+        } else {
+            break;
+        }
+    } while ( 1 );
+}
+
 void NelderMead::my_constraints(double x[])
 {
     // round to integer and bring again with allowable margins
@@ -126,7 +265,7 @@ bool NelderMead::cache_update(int threads, int freq_idx,
 
 double NelderMead::evaluate_score(const double objectives[], const double *weights) const
 {
-    double score = 0.0f;
+    double score;
     // VV: [time, energy/power, resources]
     double scale[] = {1.0, 1000.0, 1.0};
     scale[2] = (double)constraint_max[0];
@@ -134,12 +273,19 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
     if (weights == nullptr)
         weights = opt_weights;
 
+    #if 0
+    score = 0.0;
     for (auto i = 0; i < NMD_NUM_OBJECTIVES; ++i)
     {
         double t = objectives[i] / scale[i];
         score += t * t * weights[i];
     }
-
+    #else 
+    score = 0.0;
+    for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++ i) {
+        score += exp(weights[i]*objectives[i]/scale[i]);
+    }
+    #endif
     return score;
 }
 
@@ -157,9 +303,7 @@ void NelderMead::set_weights(double weights[3])
 }
 
 /* FIXME: generalize */
-void NelderMead::initialize_simplex(double params[][2],
-                                    double objectives[][3],
-                                    double weights[3],
+void NelderMead::initialize_simplex(double weights[3],
                                     double constraint_min[2],
                                     double constraint_max[2])
 {
@@ -173,50 +317,9 @@ void NelderMead::initialize_simplex(double params[][2],
     }
 
     set_weights(weights);
-
-    // VV: Need num_knobs +1
-    for (i = 0; i < NMD_NUM_KNOBS + 1; i++)
-    {
-        f[i] = evaluate_score(objectives[i], weights);
-
-        for (j = 0; j < n; j++)
-        {
-            v[i][j] = params[i][j];
-        }
-
-        my_constraints(v[i]);
-
-        optstepresult entry;
-        entry.threads = round(v[i][0]);
-        entry.freq_idx = round(v[i][1]);
-
-        // VV: Check if we can re-use a previously explored configuration
-        auto key = std::make_pair(entry.threads, entry.freq_idx);
-
-        auto past_entry = cache_.find(std::make_pair(entry.threads,
-                                                     entry.freq_idx));
-        if (past_entry != cache_.end())
-        {
-            for (j = 0; j < NMD_NUM_OBJECTIVES; ++j)
-                past_entry->second.objectives[j] = objectives[i][j];
-
-            past_entry->second._cache_timestamp = timestamp_now;
-            // VV: Skip attempting to re-insert the "same" entry
-            continue;
-        }
-
-        // VV: If we've reached this point we need to add the entry to the cache
-        for (j = 0; j < NMD_NUM_OBJECTIVES; ++j)
-            entry.objectives[j] = objectives[i][j];
-
-        entry._cache_timestamp = timestamp_now;
-        entry._cache_expires_dt = CACHE_EXPIRE_AFTER_MS;
-
-        cache_.insert(std::make_pair(key, entry));
-    }
+    state_ = warmup;
     itr = 0;
-
-    state_ = start;
+    warming_up_step = 0;
 }
 
 /* print out the initial values */
@@ -326,68 +429,31 @@ void NelderMead::sort_vertices()
     }
 }
 
-bool NelderMead::knob_set_exists(double knobs[2], int exclude)
-{
-    int is_same;
-
-    for (auto i=0; i<NMD_NUM_KNOBS+1; ++i) {
-        if ( i != exclude ) {
-            is_same = 1;
-            for ( auto j=0; j<NMD_NUM_KNOBS; ++j ) 
-                is_same &= (v[i][j] == knobs[j]);
-            
-            if ( is_same )
-                return true;
-        }
-    }
-
-    return false;
-}
-
 optstepresult NelderMead::do_step_start()
 {
     optstepresult res;
 
-    itr++;
     OUT_DEBUG(
         std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
-        print_initial_simplex();)
+        print_initial_simplex();
+    )
 
     sort_vertices();
 
-    centroid();
-    double extra[2] = {0.0, 0.0};
-    int is_invalid = 0;
-    int max_combinations = 0;
-
-    max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
-
+    centroid();   
 
     // VV: Try not to pick a knob_set that already exists in `v`
-    do {
+    auto gen_new = [this](double *extra) mutable -> double* {
+        
         for (j = 0; j < NMD_NUM_KNOBS; j++)
             vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]) + extra[j];
-        
+       
         my_constraints(vr);
-        
-        is_invalid = 0;
 
-        if ( max_combinations > NMD_NUM_KNOBS +1 ) {
-            is_invalid = knob_set_exists(vr, -1);
-
-            if ( is_invalid ) {
-                extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
-                            + (int) constraint_min[0]
-                            - (int)(0.5*(constraint_max[0] - constraint_min[0]));
-
-                extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
-                            + (int) constraint_min[1]
-                            - (int)(0.5*(constraint_max[1] - constraint_min[1]));
-                
-            }
-        } 
-        
-    } while ( is_invalid );
+        return vr;
+    };
+    
+    generate_new(gen_new);
 
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] Reflection Parameter = ("
@@ -450,38 +516,16 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
     else if (fr < f[vs])
     {
         // VV: REFLECTED is better than BEST
-
-        double extra[2] = {0.0, 0.0};
-        int is_invalid = 0;
-        int max_combinations = 0;
-
-        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
-
-        // VV: Try not to pick a knob_set that already exists in `v`
-        do {
+        auto gen_new = [this](double *extra) mutable -> double* {
             for (j = 0; j < NMD_NUM_KNOBS; j++)
                 ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]) + extra[j];
-            
+                
             my_constraints(ve);
-            
-            is_invalid = 0;
-
-            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
-                is_invalid = knob_set_exists(ve, -1);
-
-                if ( is_invalid ) {
-                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
-                                + (int) constraint_min[0]
-                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
-
-                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
-                                + (int) constraint_min[1]
-                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
-                    
-                }
-            } 
-            
-        } while ( is_invalid );
+
+            return ve;
+        };
+    
+        generate_new(gen_new);
 
         // VV: Now evaluate EXPANDED
         res.threads = ve[0];
@@ -509,38 +553,17 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
     else if ((f[vh] <= fr) && (fr < f[vg]))
     {
         // VV: REFLECTED between SECOND BEST and WORST
-        double extra[2] = {0.0, 0.0};
-        int is_invalid = 0;
-        int max_combinations = 0;
-
-        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
-
-        // VV: Try not to pick a knob_set that already exists in `v`
-        do {
+        auto gen_new = [this](double *extra) mutable -> double* {
             for (j = 0; j < NMD_NUM_KNOBS; j++)
                 vc[j] = vm[j] + BETA * (vr[j] - vm[j]) + extra[j];
-            
+                
             my_constraints(vc);
-            
-            is_invalid = 0;
-
-            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
-                is_invalid = knob_set_exists(vc, -1);
-
-                if ( is_invalid ) {
-                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
-                                + (int) constraint_min[0]
-                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
-
-                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
-                                + (int) constraint_min[1]
-                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
-                    
-                }
-            } 
-            
-        } while ( is_invalid );
 
+            return vc;
+        };
+
+        generate_new(gen_new);  
+        
         // VV: Now evaluate EXPANDED
         res.threads = vc[0];
         res.freq_idx = vc[1];
@@ -567,37 +590,16 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
     else
     {
         // VV: REFLECTED worse than WORST
-        double extra[2] = {0.0, 0.0};
-        int is_invalid = 0;
-        int max_combinations = 0;
-
-        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
-
-        // VV: Try not to pick a knob_set that already exists in `v`
-        do {
+        auto gen_new = [this](double *extra) mutable -> double* {
             for (j = 0; j < NMD_NUM_KNOBS; j++)
                 vc[j] = vm[j] - BETA * (vr[j] - vm[j]) + extra[j];
-            
+                
             my_constraints(vc);
-            
-            is_invalid = 0;
-
-            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
-                is_invalid = knob_set_exists(vc, -1);
-
-                if ( is_invalid ) {
-                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
-                                + (int) constraint_min[0]
-                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
-
-                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
-                                + (int) constraint_min[1]
-                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
-                    
-                }
-            } 
-            
-        } while ( is_invalid );
+
+            return vc;
+        };
+
+        generate_new(gen_new);
 
         // VV: Now evaluate EXPANDED
         res.threads = vc[0];
@@ -686,37 +688,17 @@ optstepresult NelderMead::do_step_contract(const double objectives[])
     {
         // VV: Replace SECOND BEST
         double new_vh[NMD_NUM_KNOBS];
-        double extra[NMD_NUM_KNOBS] = {0.0, 0.0};
-        int is_invalid = 0;
-        int max_combinations = 0;
-
-        max_combinations = (constraint_max[0] - constraint_min[0]+1) * (constraint_max[1] - constraint_min[1]+1);
-
-        // VV: Try not to pick a knob_set that already exists in `v`
-        do {
-            for (j = 0; j < NMD_NUM_KNOBS; j++)
+        
+        auto gen_new = [this, &new_vh](double *extra) mutable -> double* {
+            for (auto j = 0; j < NMD_NUM_KNOBS; j++)
                 new_vh[j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]) + extra[j];
-            
+                
             my_constraints(new_vh);
-            
-            is_invalid = 0;
-
-            if ( max_combinations > NMD_NUM_KNOBS +1 ) {
-                is_invalid = knob_set_exists(new_vh, -1);
-
-                if ( is_invalid ) {
-                    extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0])
-                                + (int) constraint_min[0]
-                                - (int)(0.5*(constraint_max[0] - constraint_min[0]));
-
-                    extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1])
-                                + (int) constraint_min[1]
-                                - (int)(0.5*(constraint_max[1] - constraint_min[1]));
-                    
-                }
-            } 
-            
-        } while ( is_invalid );
+
+            return new_vh;
+        };
+
+        generate_new(gen_new);
 
         for (j = 0; j < NMD_NUM_KNOBS; j++)
             v[vh][j] = new_vh[j];
@@ -769,15 +751,69 @@ optstepresult NelderMead::step(const double objectives[])
     optstepresult res;
     res.threads = 0;
     res.freq_idx = -1;
-    std::cout << "Starting step with "
-                << objectives[0] << " " 
-                << objectives[1] << " " 
-                << objectives[2] << std::endl;
+    OUT_DEBUG(
+        std::cout << "[NelderMead|DEBUG] Starting step with "
+            << objectives[0] << " " 
+            << objectives[1] << " " 
+            << objectives[2] << std::endl;
+    )
     
+    std::size_t tested_combinations = cache_.size();
+
     switch (state_)
     {
+    case warmup:
+    {
+        #ifdef NMD_DEBUG_
+            std::cout << "[NelderMead|DEBUG] State = Warmup " 
+                      << warming_up_step << std::endl;
+        #endif
+        if ( warming_up_step > 0 ) {
+            // VV: Record results of last warming up step
+            f[warming_up_step-1] = evaluate_score(objectives, nullptr);
+            cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], 
+                         objectives, true);
+        }
 
+        if ( warming_up_step == NMD_NUM_KNOBS + 1) {
+            // VV: We need not explore the knob_set space anymore
+            state_ = start;
+            return step(objectives);
+        }
+
+        // VV: Start at 25% threads with lowest CPU Freq, then 75% threads with max freq
+        //     and 100% threads with max freq
+
+        int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
+                    + constraint_min[1]);
+        int threads_med = round(0.75 * (constraint_max[0] - constraint_min[1])
+                    + constraint_min[1]);
+        int threads_max = constraint_max[0];
+
+        const int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS] = {
+            {threads_low, (int)constraint_min[1]},
+            {threads_med, (int)constraint_max[1]},
+            {threads_max, (int)constraint_max[1]},
+        };
+
+        optstepresult res;
+        res.objectives[0] = -1;
+        res.objectives[1] = -1;
+        res.objectives[2] = -1;
+        res.converged = false;
+        res.score = -1;
+        res.threads = initial_configurations[warming_up_step][0];
+        res.freq_idx = initial_configurations[warming_up_step][1];
+        
+        v[warming_up_step][0] = res.threads;
+        v[warming_up_step][1] = res.freq_idx;
+        warming_up_step++;
+
+        return res;
+    }
+    break;
     case start:
+        itr++;
         res = do_step_start();
         break;
     case reflection:
@@ -798,14 +834,17 @@ optstepresult NelderMead::step(const double objectives[])
         return res;
     }
 
-    res.converged = testConvergence();
+    res.converged = testConvergence(tested_combinations);
 
     if (res.converged == true)
     {
         res.threads = v[vs][0];
         res.freq_idx = v[vs][1];
-        std::cout << "Converged to " << res.threads << " " << res.freq_idx << std::endl;
+        OUT_DEBUG(
+            std::cout << "[NelderMead|DEBUG] Converged to " << res.threads << " " << res.freq_idx << std::endl;
+        )
     }
+
     std::cout << "Stop step with "
                 << objectives[0] << " " 
                 << objectives[1] << " " 
@@ -814,7 +853,7 @@ optstepresult NelderMead::step(const double objectives[])
     return res;
 }
 
-bool NelderMead::testConvergence()
+bool NelderMead::testConvergence(std::size_t tested_combinations)
 {
     double temp;
     #if 0
@@ -852,12 +891,32 @@ bool NelderMead::testConvergence()
     std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl;
     std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl;
 #endif
-    if (s >= EPSILON && itr <= MAXITERATIONS)
+    int max_combinations = (constraint_max[0] - constraint_min[0]+1) 
+                            * (constraint_max[1] - constraint_min[1]+1);
+
+    if ( (s >= EPSILON)
+        && (itr <= MAXITERATIONS)
+        && (max_combinations != tested_combinations) )
         return false;
     else
     {
         sort_vertices();
         min = f[vs];
+
+        OUT_DEBUG(
+            std::cout << "[NelderMead|Debug] Cache_ Max: " << max_combinations 
+                        << " explored " << tested_combinations << std::endl;
+            for (const auto &entry: cache_ ) {
+                std::cout << "[NelderMead|Debug] Cache_ " 
+                    << entry.second.threads << " " 
+                    << entry.second.freq_idx << " :: "
+                    << entry.second.objectives[0] << " "
+                    << entry.second.objectives[1] << " "
+                    << entry.second.objectives[2] << " :: "
+                    << evaluate_score(entry.second.objectives, nullptr) << std::endl;
+            }
+        )
+
         return true;
     }
 }
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 3b4aaba..cb5d936 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -804,7 +804,7 @@ void scheduler::optimize_locally(work_item const& work)
 
             elapsedTimeMs = t_duration_now - last_optimization_timestamp_;
 
-            if (elapsedTimeMs > optimization_period_ms){
+            if (elapsedTimeMs > optimization_period_ms || nr_opt_steps == 0){
                 last_optimization_timestamp_= t_duration_now;
                 nr_opt_steps++;
                 actuation act_temp = lopt_.step();

From 755f0b6fbdd6f36e788fceebd19abf11740f6e31 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Wed, 14 Nov 2018 16:43:01 +0000
Subject: [PATCH 07/37] Default to highest CPU frequency if using CPUFREQ

Do a final run once NMD (Nelder-Mead) converges to make sure that the
scores of the simplex vertices are not stale.
---
 allscale/components/localoptimizer.hpp   | 389 ++++++++++++-----------
 allscale/components/nmsimplex_bbincr.hpp |   3 +
 src/components/localoptimizer.cpp        |  19 ++
 src/components/nmsimplex_bbincr.cpp      |  61 +++-
 src/components/scheduler_component.cpp   |  12 +-
 5 files changed, 285 insertions(+), 199 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index 4b2d1ce..d59bf16 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -19,243 +19,270 @@
 #define DEBUG_ 1
 #define DEBUG_MULTIOBJECTIVE_ 1
 
-namespace allscale { namespace components {
-
-    enum objectiveType {time, energy, resource};
-
-    enum parameterType {thread, frequency};
-
-    enum searchPolicy {allscale, random, manual};
-
-    /* structure type of a single optimization objective */
-    struct objective{
-      double last_scores[3];
-
-      objectiveType type;
-      /* leeway threshold desired, 0-1 double */
-      double leeway;
-      /* non-negative integer priority of the objective, 0 is highest priority*/
-      int priority;
-      /* local minimum during single objective optimization */
-      double localmin;
-      /* local maximum during single objective optimization */
-      double localmax;
-      /* local minimum during single objective optimization */
-      double globalmin;
-      /* local minimum during single objective optimization */
-      double globalmax;
-      /* current deviation of the objective value from observed min */
-      double currentthreshold;
-      /* sampled objective values throughout execution */
-      std::vector<double> samples;
-      /* thread number that lead to the objective value in samples vector */
-      std::vector<double> threads_samples;
-      /* frequency index that lead to the objective value in samples vector */
-      std::vector<double> freq_samples;
-      /* true if optimization of objective has converged, false otherwise */
-      bool converged;
-      /* true if optimizer for objective has been initialized, false otherwise */
-      bool initialized;
-      /* index to the parameter vectors for setup that has so far achieved
+namespace allscale
+{
+namespace components
+{
+
+enum objectiveType
+{
+	time,
+	energy,
+	resource
+};
+
+enum parameterType
+{
+	thread,
+	frequency
+};
+
+enum searchPolicy
+{
+	allscale,
+	random,
+	manual
+};
+
+/* structure type of a single optimization objective */
+struct objective
+{
+	double last_scores[3];
+
+	objectiveType type;
+	/* leeway threshold desired, 0-1 double */
+	double leeway;
+	/* non-negative integer priority of the objective, 0 is highest priority*/
+	int priority;
+	/* local minimum during single objective optimization */
+	double localmin;
+	/* local maximum during single objective optimization */
+	double localmax;
+	/* local minimum during single objective optimization */
+	double globalmin;
+	/* local minimum during single objective optimization */
+	double globalmax;
+	/* current deviation of the objective value from observed min */
+	double currentthreshold;
+	/* sampled objective values throughout execution */
+	std::vector<double> samples;
+	/* thread number that lead to the objective value in samples vector */
+	std::vector<double> threads_samples;
+	/* frequency index that lead to the objective value in samples vector */
+	std::vector<double> freq_samples;
+	/* true if optimization of objective has converged, false otherwise */
+	bool converged;
+	/* true if optimizer for objective has been initialized, false otherwise */
+	bool initialized;
+	/* index to the parameter vectors for setup that has so far achieved
          the minimum over all samples */
-      long int min_params_idx;
-      double converged_minimum;
-      double minimization_params[2];
-    };
+	long int min_params_idx;
+	double converged_minimum;
+	double minimization_params[2];
+};
 
-    
-    /* structure type modelling an optimization actuation action to be taken
+/* structure type modelling an optimization actuation action to be taken
        by the scheduler */
-    struct actuation{
-       /* number of threads to resume (>0) or suspend (<0). If set to zero,
+struct actuation
+{
+	/* number of threads to resume (>0) or suspend (<0). If set to zero,
           number of threads will stay unchanged. */
-       unsigned int delta_threads;
+	unsigned int delta_threads;
 
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-        /* index to the global cpu-supported frequencies vector pointing to
+	/* index to the global cpu-supported frequencies vector pointing to
            the new frequency to be set. If set to -1, frequency will stay
            unchanged */
-       int frequency_idx;
-       int previous_frequency_idx;
+	int frequency_idx;
+	int previous_frequency_idx;
 #endif
-    };
-
-    struct localoptimizer
-    {
-        localoptimizer()
-            :nmd(convergence_threshold_),
-            pending_threads(0.),
-            pending_energy(0.),
-            pending_time(0.),
-            pending_num_times(0.),
-            mo_initialized(false),
+};
+
+struct localoptimizer
+{
+	localoptimizer()
+		: pending_threads(0.),
+		  pending_energy(0.),
+		  pending_time(0.),
+		  pending_num_times(0.),
+		  mo_initialized(false),
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-            frequency_param_(0),
+		  frequency_param_(0),
 #endif
-            current_objective_idx_(0),converged_(false)
-    {
-        if (optmethod_==random)
-            srand (std::time(NULL));
-        }
-
-        localoptimizer(std::list<objective>);
-
-        bool isConverged(){return converged_;}
-
-        void setPolicy(searchPolicy pol){
-          optmethod_ = pol;
+		  current_objective_idx_(0), 
+		  converged_(false),
+		  convergence_threshold_(0.01),
+		  nmd(0.01)
+	{
+		if (optmethod_ == random)
+			srand(std::time(NULL));
+	}
+	localoptimizer(std::list<objective>);
+
+	bool isConverged();
+
+	void setPolicy(searchPolicy pol)
+	{
+		optmethod_ = pol;
 #ifdef DEBUG_
-          std::cout << "Local Optimizer Initialized with "
-                    << policyToString(pol)
-                    << " policy for multi-objective search."
-                    << std::endl;
+		std::cout << "Local Optimizer Initialized with "
+				  << policyToString(pol)
+				  << " policy for multi-objective search."
+				  << std::endl;
 #endif
-        }
+	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
-        void initialize_nmd();
+	void initialize_nmd();
 #endif
-        double opt_weights[NMD_NUM_OBJECTIVES];
+	double opt_weights[NMD_NUM_OBJECTIVES];
 
-        searchPolicy getPolicy(){return optmethod_;}
+	searchPolicy getPolicy() { return optmethod_; }
 
-        void setobjectives(std::list<objective>);
+	void setobjectives(std::list<objective>);
 
-        std::size_t getCurrentThreads(){return threads_param_;}
+	std::size_t getCurrentThreads() { return threads_param_; }
 
-        void setCurrentThreads(std::size_t threads){threads_param_ = threads;}
+	void setCurrentThreads(std::size_t threads) { threads_param_ = threads; }
 
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-        unsigned int getCurrentFrequencyIdx(){return frequency_param_;}
-
-        void setCurrentFrequencyIdx(unsigned int idx){frequency_param_ = idx;}
-
-        const std::vector<unsigned long>
-        setfrequencies(std::vector<unsigned long> frequencies){
-            frequencies_param_allowed_=frequencies;
-            //std::cout << "**************** = " << frequency_param_ << std::endl;
-            //for(auto& el: frequencies_param_allowed_)
-            //  std::cout << "***>>>> " << el << std::endl;
-            return frequencies_param_allowed_;
-        }
+	unsigned int getCurrentFrequencyIdx()
+	{
+		return frequency_param_;
+	}
+
+	void setCurrentFrequencyIdx(unsigned int idx) { frequency_param_ = idx; }
+
+	const std::vector<unsigned long>
+	setfrequencies(std::vector<unsigned long> frequencies)
+	{
+		frequencies_param_allowed_ = frequencies;
+		//std::cout << "**************** = " << frequency_param_ << std::endl;
+		//for(auto& el: frequencies_param_allowed_)
+		//  std::cout << "***>>>> " << el << std::endl;
+		return frequencies_param_allowed_;
+	}
 #endif
-        std::size_t getmaxthreads() {
-            return max_threads_;
-        }
+	std::size_t getmaxthreads()
+	{
+		return max_threads_;
+	}
 
-        void setmaxthreads(std::size_t threads);
+	void setmaxthreads(std::size_t threads);
 
-        /* executes one step of multi-objective optimization */
-        actuation step();
+	/* executes one step of multi-objective optimization */
+	actuation step();
 
-        /* adds a measurement sample to the specified objective */
-        void measureObjective(double iter_time, double power, double threads);
+	/* adds a measurement sample to the specified objective */
+	void measureObjective(double iter_time, double power, double threads);
 
-        /* restarts multi-objective optimization from current best solution */
-        void reset(int,int);
+	/* restarts multi-objective optimization from current best solution */
+	void reset(int, int);
 
 #ifdef DEBUG_
-        void printobjectives();
-        void printverbosesteps(actuation);
+	void printobjectives();
+	void printverbosesteps(actuation);
 #endif
 
-        std::string policyToString(searchPolicy pol){
-          std::string str;
-          switch (pol){
-            case random:
-              str = "random";
-              break;
-            case allscale:
-              str = "allscale";
-              break;
-            case manual:
-              str = "manual";
-              break;
-          }
-          return str;
-        }
-
-    private:
-        // VV: Used to convert thread_idx to actual number of threads
-        std::size_t threads_dt;
-
-        void accumulate_objective_measurements();
-        void reset_accumulated_measurements();
-
-        std::vector<double> samples_energy;
-        std::vector<double> samples_time;
-        std::vector<double> samples_threads;
-        std::vector<double> samples_freq;
-
-        bool explore_knob_domain;
-        
-        double pending_time, pending_energy, pending_threads;
-        unsigned long pending_num_times;
-
-        bool mo_initialized;
-
-        /* vector of active optimization objectives. Objectives are stored
+	std::string policyToString(searchPolicy pol)
+	{
+		std::string str;
+		switch (pol)
+		{
+		case random:
+			str = "random";
+			break;
+		case allscale:
+			str = "allscale";
+			break;
+		case manual:
+			str = "manual";
+			break;
+		}
+		return str;
+	}
+
+  private:
+	// VV: Used to convert thread_idx to actual number of threads
+	std::size_t threads_dt;
+
+	void accumulate_objective_measurements();
+	void reset_accumulated_measurements();
+
+	std::vector<double> samples_energy;
+	std::vector<double> samples_time;
+	std::vector<double> samples_threads;
+	std::vector<double> samples_freq;
+
+	bool explore_knob_domain;
+
+	double pending_time, pending_energy, pending_threads;
+	unsigned long pending_num_times;
+
+	bool mo_initialized;
+
+	/* vector of active optimization objectives. Objectives are stored
            in the vector in decreasing priority order */
-        std::vector<objective> objectives_;
+	std::vector<objective> objectives_;
 
-        NelderMead nmd;
+	NelderMead nmd;
 
-        /* counts number of parameter changes (as pair) */
-        unsigned long long int param_changes_;
+	/* counts number of parameter changes (as pair) */
+	unsigned long long int param_changes_;
 
-        /* single objective optimization method used */
-        searchPolicy optmethod_ = random;
+	/* single objective optimization method used */
+	searchPolicy optmethod_ = random;
 
-        /* active optimization parameter - nr of OS threads active */
-        int threads_param_;
+	/* active optimization parameter - nr of OS threads active */
+	int threads_param_;
 
-        /* ordered set of OS thread values that have been assigned to the
+	/* ordered set of OS thread values that have been assigned to the
            runtime by the optimization algorithm. The most recent value is
            stored at the end of the vector */
-        std::vector<unsigned long> thread_param_values_;
+	std::vector<unsigned long> thread_param_values_;
 
-        /* maximum number of OS threads supported by the runtime */
-        std::size_t max_threads_;
+	/* maximum number of OS threads supported by the runtime */
+	std::size_t max_threads_;
 
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-        /* active optimization parameter - current CPU frequency index */
-        unsigned int frequency_param_;
+	/* active optimization parameter - current CPU frequency index */
+	unsigned int frequency_param_;
 
-        /* ordered set of frequency values that the CPU has been set to by
+	/* ordered set of frequency values that the CPU has been set to by
            the optimization algorithm. The most recent value is stored at the
            end of the vector */
-        std::vector<unsigned long> frequency_param_values_;
+	std::vector<unsigned long> frequency_param_values_;
 
-        /* vector containing sorted list of frequencies supported by the
+	/* vector containing sorted list of frequencies supported by the
            processor */
-        std::vector<unsigned long> frequencies_param_allowed_;
+	std::vector<unsigned long> frequencies_param_allowed_;
 #endif
 
-        /* threshold (percentage in [0,1]) to decide convergence of optimization
+	/* threshold (percentage in [0,1]) to decide convergence of optimization
            steps */
-        const double convergence_threshold_ = 0.01;
+	double convergence_threshold_;
 
-        /***** optimization state variables ******/
+	/***** optimization state variables ******/
 
-        /* index to the _objectives vector of currently optimized objective */
-        unsigned short int current_objective_idx_;
+	/* index to the _objectives vector of currently optimized objective */
+	unsigned short int current_objective_idx_;
 
-        /* number of times the optimizer step() has been invoked, this is for
+	/* number of times the optimizer step() has been invoked, this is for
            init and housekeeping purposes */
-         unsigned long long int steps_;
+	unsigned long long int steps_;
 
-        /* currently optimized parameter */
-        parameterType current_param_;
+	/* currently optimized parameter */
+	parameterType current_param_;
 
-        /* initial warm-up steps */
-        const unsigned int warmup_steps_=3;
+	/* initial warm-up steps */
+	const unsigned int warmup_steps_ = 3;
 
-        /* maximum number of optimization steps allowed */
-        const int max_steps_=100;
+	/* maximum number of optimization steps allowed */
+	const int max_steps_ = 100;
 
-        /* set to true if local optimizer has converged over all objectives */
-        bool converged_;
-    };
-}
-}
+	/* set to true if local optimizer has converged over all objectives */
+	bool converged_;
+};
+} // namespace components
+} // namespace allscale
 
 #endif
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 2674517..58844d9 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -163,6 +163,9 @@ class NelderMead
 		return rnum;
 	}
 
+	bool convergence_reevaluating;
+	int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
+	
 	/* vertex with smallest value */
 	int vs;
 
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 14158fa..fc8d114 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -126,6 +126,24 @@ void localoptimizer::printobjectives()
 	}
 }
 
+bool localoptimizer::isConverged()
+{	
+	#if 0
+	if ( converged_ == false ) {
+		return false;
+	}
+
+	auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+	if ( reexplore_every_ms >0 && timestamp_now - last_convergence_ts > reexplore_every_ms )
+	{	
+		std::cout << "[LOCALOPTIMIZER] Re-exploring space!" << std::endl;
+		initialize_nmd();
+	}
+	#endif 
+	return converged_; 
+}
+
 void localoptimizer::printverbosesteps(actuation act)
 {
 	static int last_frequency_idx = 0;
@@ -200,6 +218,7 @@ void localoptimizer::initialize_nmd()
 
 	mo_initialized = true;
 	explore_knob_domain = true;
+	converged_ = false;
 }
 #endif
 
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index c878199..4b51c71 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -48,6 +48,7 @@ NelderMead::NelderMead(double eps)
     vm = (double *)malloc(n * sizeof(double));
     
     warming_up_step = 0;
+    convergence_reevaluating = false;
 
     /* allocate the columns of the arrays */
     for (i = 0; i <= n; i++)
@@ -320,6 +321,23 @@ void NelderMead::initialize_simplex(double weights[3],
     state_ = warmup;
     itr = 0;
     warming_up_step = 0;
+    convergence_reevaluating = false;
+    cache_.clear();
+
+    int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
+                + constraint_min[1]);
+    int threads_med = round(0.5 * (constraint_max[0] - constraint_min[1])
+                + constraint_min[1]);
+    int threads_high = constraint_max[0] * 0.75;
+
+    initial_configurations[0][0] = threads_low;
+    initial_configurations[0][1] = (int)constraint_min[1];
+
+    initial_configurations[1][0] = threads_med;
+    initial_configurations[1][1] = (int)constraint_max[1];
+
+    initial_configurations[2][0] = threads_high;
+    initial_configurations[2][1] = (int)constraint_max[1];
 }
 
 /* print out the initial values */
@@ -781,21 +799,6 @@ optstepresult NelderMead::step(const double objectives[])
             return step(objectives);
         }
 
-        // VV: Start at 25% threads with lowest CPU Freq, then 75% threads with max freq
-        //     and 100% threads with max freq
-
-        int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
-                    + constraint_min[1]);
-        int threads_med = round(0.75 * (constraint_max[0] - constraint_min[1])
-                    + constraint_min[1]);
-        int threads_max = constraint_max[0];
-
-        const int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS] = {
-            {threads_low, (int)constraint_min[1]},
-            {threads_med, (int)constraint_max[1]},
-            {threads_max, (int)constraint_max[1]},
-        };
-
         optstepresult res;
         res.objectives[0] = -1;
         res.objectives[1] = -1;
@@ -872,6 +875,7 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
         return true;
     }
     #endif
+    bool ret = false;
 
     fsum = 0.0;
     for (auto j = 0; j <= n; j++)
@@ -897,7 +901,7 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
     if ( (s >= EPSILON)
         && (itr <= MAXITERATIONS)
         && (max_combinations != tested_combinations) )
-        return false;
+        ret = false;
     else
     {
         sort_vertices();
@@ -917,7 +921,32 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
             }
         )
 
+        ret = true;
+    }
+
+    if ( ret == true && convergence_reevaluating == true ) {
         return true;
+    } else if ( ret == true ) {
+        // VV: Do another final run to make sure that the objective scores still hold up
+        OUT_DEBUG (
+            std::cout << "[NelderMead|Debug] Doing another final search" << std::endl;
+        )
+        state_ = warmup;
+        warming_up_step = 0;
+        itr --;
+        convergence_reevaluating = true;
+
+        for (auto i=0; i<NMD_NUM_KNOBS+1; ++i ) {
+            for (auto j=0; j<NMD_NUM_KNOBS; ++j) {
+                initial_configurations[i][j] = v[i][j];
+            }
+        }
+
+        print_initial_simplex();
+
+        return false;
+    } else {
+        return false;
     }
 }
 
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index cb5d936..6323faf 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -101,8 +101,8 @@ scheduler::scheduler(std::uint64_t rank)
 #ifdef DEBUG_KOSTAS
   std::cout << "DEBUG_KOSTAS is defined" << std::endl << std::flush;
 #endif
-#ifdef ALLSCALE_HAVE_CPUFREQ_
-  std::cout << "ALLSCALE_HAVE_CPUFREQ_ is defined" << std::endl << std::flush;
+#ifdef ALLSCALE_HAVE_CPUFREQ
+  std::cout << "ALLSCALE_HAVE_CPUFREQ is defined" << std::endl << std::flush;
 #endif
 
 }
@@ -524,6 +524,14 @@ void scheduler::init() {
     lopt_.printobjectives();
 #endif
   }
+#if defined(ALLSCALE_HAVE_CPUFREQ)
+else {
+    using hardware_reconf = allscale::components::util::hardware_reconf;
+    auto  freqs = hardware_reconf::get_frequencies(0);
+    // VV: Set maximum frequency
+    fix_allcores_frequencies(freqs[freqs.size()-1]);
+}
+#endif
 }
 
 /**

From cc23131d952be5989beb620f9f0424f853f84b5b Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 15 Nov 2018 10:08:55 +0000
Subject: [PATCH 08/37] Improved Power and Active Thread count logging when
 measuring statistics

---
 allscale/components/localoptimizer.hpp |  19 ++++
 allscale/components/scheduler.hpp      |   5 +-
 src/components/localoptimizer.cpp      |   1 +
 src/components/nmsimplex_bbincr.cpp    |  71 ++++++++++++-
 src/components/scheduler_component.cpp | 142 ++++++++++++-------------
 5 files changed, 159 insertions(+), 79 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index d59bf16..cfc43b5 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -155,6 +155,25 @@ struct localoptimizer
 	const std::vector<unsigned long>
 	setfrequencies(std::vector<unsigned long> frequencies)
 	{
+		#if 1
+		const std::size_t max_freqs = 10;
+		std::size_t keep_every = (std::size_t) ceilf(frequencies.size() / (float) max_freqs);
+
+		if ( keep_every > 1 ) {
+			std::vector<unsigned long> new_freqs;
+
+			int i, j, len;
+
+			for (j=0, i=0, len=frequencies.size(); i<len; ++i ) {
+				if ( (i==len-1) || ( (i % keep_every) == 0 )) {
+				new_freqs.push_back(frequencies[i]);
+				}
+			}      
+
+			frequencies = new_freqs;
+		}
+		#endif
+
 		frequencies_param_allowed_ = frequencies;
 		//std::cout << "**************** = " << frequency_param_ << std::endl;
 		//for(auto& el: frequencies_param_allowed_)
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 0980207..9d2ca2c 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -93,9 +93,10 @@ namespace allscale { namespace components {
 
 #ifdef MEASURE_
         // convenience methods to update measured metrics of interest
-        void update_active_osthreads(std::size_t);
-        void update_power_consumption(std::size_t);
+        void update_active_osthreads(std::size_t threads, int64_t delta_time);
+        void update_power_consumption(std::size_t power_sample, int64_t delta_time);
 #endif
+        int64_t last_measure_power;
 
         void fix_allcores_frequencies(int index);
 
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index fc8d114..19a05e8 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -301,6 +301,7 @@ actuation localoptimizer::step()
 				act.frequency_idx = minimization_point[1];
 				// VV: Stop searching for new knob_set
 				explore_knob_domain = false;
+				converged_ = true;
 			} else {
 				// VV: Have not converged yet, keep exploring
 				act.delta_threads = nmd_res.threads;
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 4b51c71..94e004c 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -507,8 +507,24 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
 #endif
-    fr = evaluate_score(objectives, opt_weights);
+    // VV: Make sure that we actually profiled what we meant to
+    int profiled_threads = objectives[2];
+
+    if ( (int) vr[0] != profiled_threads ) {
+        std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " threads "
+                     "but ended up using " << profiled_threads << std::endl;
+        
+        auto key = std::make_pair((int)vr[0], (int)vr[1]);
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled_threads;
+        }
 
+        vr[0] = profiled_threads;
+    }
+
+    fr = evaluate_score(objectives, opt_weights);
+    
     if ((f[vs] <= fr) && (fr < f[vh]))
     {
         // VV: REFLECTED point is better than the SECOND BEST
@@ -650,6 +666,22 @@ optstepresult NelderMead::do_step_expand(const double objectives[])
 #endif
     fe = evaluate_score(objectives, nullptr);
 
+    // VV: Make sure that we actually profiled what we meant to
+    int profiled_threads = objectives[2];
+
+    if ( (int) ve[0] != profiled_threads ) {
+        std::cout << "[NelderMead|WARN] Meant to profile " << ve[0] << " threads "
+                     "but ended up using " << profiled_threads << std::endl;
+        
+        auto key = std::make_pair((int)ve[0], (int)ve[1]);
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled_threads;
+        }
+
+        ve[0] = profiled_threads;
+    }
+
     if (fe < fr)
     {
         // VV: EXPANDED point is better than REFLECTIVE
@@ -686,6 +718,22 @@ optstepresult NelderMead::do_step_contract(const double objectives[])
 #endif
     fc = evaluate_score(objectives, nullptr);
 
+    // VV: Make sure that we actually profiled what we meant to
+    int profiled_threads = objectives[2];
+
+    if ( (int) vc[0] != profiled_threads ) {
+        std::cout << "[NelderMead|WARN] Meant to profile " << vc[0] << " threads "
+                     "but ended up using " << profiled_threads << std::endl;
+        
+        auto key = std::make_pair((int)vc[0], (int)vc[1]);
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled_threads;
+        }
+
+        vc[0] = profiled_threads;
+    }
+
     if (fc <= fr)
     {
         // VV: CONTRACTED_O is better than REFLECTED
@@ -754,6 +802,16 @@ optstepresult NelderMead::do_step_shrink(const double objectives[])
 #endif
     f[vh] = evaluate_score(objectives, nullptr);
 
+    // VV: Make sure that we actually profiled what we meant to
+    int profiled_threads = objectives[2];
+
+    if ( (int) v[vh][0] != profiled_threads ) {
+        std::cout << "[NelderMead|WARN] Meant to profile " << v[vh][0] << " threads "
+                     "but ended up using " << profiled_threads << std::endl;
+        
+        v[vh][0] = profiled_threads;
+    }
+
     const int threads = (int)(v[vh][0]);
     const int freq_idx = (int)(v[vh][1]);
 
@@ -786,10 +844,18 @@ optstepresult NelderMead::step(const double objectives[])
             std::cout << "[NelderMead|DEBUG] State = Warmup " 
                       << warming_up_step << std::endl;
         #endif
+        // VV: Make sure that we actually profiled what we meant to
+        int profiled_threads = objectives[2];
+
         if ( warming_up_step > 0 ) {
+            if ( (int) v[warming_up_step-1][0] != profiled_threads ) {
+                std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " threads "
+                            "but ended up using " << profiled_threads << std::endl;
+                v[warming_up_step-1][0] = profiled_threads;
+            }
             // VV: Record results of last warming up step
             f[warming_up_step-1] = evaluate_score(objectives, nullptr);
-            cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], 
+            cache_update(profiled_threads, v[warming_up_step-1][1], 
                          objectives, true);
         }
 
@@ -935,6 +1001,7 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
         warming_up_step = 0;
         itr --;
         convergence_reevaluating = true;
+        cache_.clear();
 
         for (auto i=0; i<NMD_NUM_KNOBS+1; ++i ) {
             for (auto j=0; j<NMD_NUM_KNOBS; ++j) {
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 6323faf..9b522d8 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -205,10 +205,11 @@ void scheduler::init() {
     return;
 
 #ifdef MEASURE_
-  update_active_osthreads(0);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  update_power_consumption(hardware_reconf::read_system_power());
-#endif
+  last_measure_power = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+// update_active_osthreads(0);
+// #ifdef ALLSCALE_HAVE_CPUFREQ
+//   update_power_consumption(hardware_reconf::read_system_power(), 1);
+// #endif
 #endif
 
   rp_ = &hpx::resource::get_partitioner();
@@ -496,25 +497,7 @@ void scheduler::init() {
     using hardware_reconf = allscale::components::util::hardware_reconf;
     auto  freqs = hardware_reconf::get_frequencies(0);
 
-    const std::size_t max_freqs = 5;
-    std::size_t keep_every = (std::size_t) ceilf(freqs.size() / (float) max_freqs);
-
-    if ( keep_every > 1 ) {
-      std::vector<unsigned long> new_freqs;
-
-      int i, j, len;
-
-      for (j=0, i=0, len=freqs.size(); i<len; ++i ) {
-        if ( (i==len-1) || ( (i % keep_every) == 0 )) {
-          new_freqs.push_back(freqs[i]);
-        }
-      }      
-
-        freqs = new_freqs;
-    }
-
-    std::vector<unsigned long> freq_temp =
-      lopt_.setfrequencies(freqs);
+    auto freq_temp = lopt_.setfrequencies(freqs);
     if (freq_temp.empty()){
       HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
       "error in initializing the local optimizer, allowed frequency values are empty");
@@ -769,17 +752,33 @@ void scheduler::optimize_locally(work_item const& work)
 #endif
 
 #ifdef MEASURE_
-#ifdef ALLSCALE_HAVE_CPUFREQ
         std::size_t temp_id = work.id().id;
         if ((temp_id >= period_for_power) && (temp_id % period_for_power == 0))
-            update_power_consumption(hardware_reconf::read_system_power());
+        {
+          auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+          auto dt = timestamp_now - last_measure_power;
+
+          dt = dt > 0 ? dt : 1 ;
+
+          last_measure_power = timestamp_now;
+          
+          update_active_osthreads(active_threads, dt);
+#ifdef ALLSCALE_HAVE_CPUFREQ
+          allscale::components::monitor *monitor_c = &allscale::monitor::get();
+          auto measurement = monitor_c->get_current_power();
+          if ( measurement <= 10000 ) {
+            update_power_consumption(measurement, dt);
+          }
 #endif
+        }
+
 #endif
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
         if (uselopt && !lopt_.isConverged()) {
             last_power_usage++;
-            current_power_usage = hardware_reconf::read_system_power();
+            allscale::components::monitor *monitor_c = &allscale::monitor::get();
+            current_power_usage = monitor_c->get_current_power();
             power_sum += current_power_usage;
 
             auto t_now = std::chrono::system_clock::now();
@@ -804,8 +803,7 @@ void scheduler::optimize_locally(work_item const& work)
                 }
 
                 lopt_.measureObjective(current_avg_iter_time,power_sum/last_power_usage,
-                // active_threads
-                        lopt_.getCurrentThreads());
+                        active_threads);
                 last_power_usage=0;
                 power_sum=0;
             }
@@ -1075,9 +1073,9 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) {
   std::cout << "total active PUs: " << active_threads_ << "\n";
 #endif
 
-#ifdef MEASURE_
-  update_active_osthreads(active_threads_-active_threads);
-#endif
+// #ifdef MEASURE_
+//   update_active_osthreads(active_threads_-active_threads);
+// #endif
 
   active_threads = active_threads_;
 
@@ -1140,9 +1138,9 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) {
             )
         );
   }
-#ifdef MEASURE_
-  update_active_osthreads(-1 * suspend_threads.size());
-#endif
+// #ifdef MEASURE_
+//   update_active_osthreads(-1 * suspend_threads.size());
+// #endif
 
   active_threads = active_threads - suspend_threads.size();
 
@@ -1261,9 +1259,9 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) {
   std::cout << "total active PUs: " << active_threads_ << "\n";
 #endif
 
-#ifdef MEASURE_
-  update_active_osthreads(active_threads_-active_threads);
-#endif
+// #ifdef MEASURE_
+//   update_active_osthreads(active_threads_-active_threads);
+// #endif
 
   active_threads = active_threads_;
   // if no thread is suspended, nothing to do
@@ -1320,9 +1318,9 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) {
             )
         );
   }
-#ifdef MEASURE_
-  update_active_osthreads(resume_threads.size());
-#endif
+// #ifdef MEASURE_
+//   update_active_osthreads(resume_threads.size());
+// #endif
   active_threads = active_threads + resume_threads.size();
 #ifdef DEBUG_THREADSTATUS_
   std::cout << "[SCHEDULER|INFO]: Thread Resume - Newly Active Threads: " << active_threads
@@ -1441,51 +1439,31 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){
 #endif
 
 #ifdef MEASURE_
-void scheduler::update_active_osthreads(std::size_t delta) {
-  std::size_t temp = active_threads + delta;
-  if (meas_active_threads_max==0)
-    meas_active_threads_max=temp;
+void scheduler::update_active_osthreads(std::size_t threads, int64_t delta_time) {
 
-  if (meas_active_threads_min==0)
-    meas_active_threads_min=temp;
+  if (meas_active_threads_max==0 || meas_active_threads_max < threads)
+    meas_active_threads_max=threads;
 
-  if (meas_active_threads_sum==0){
-    meas_active_threads_count++;
-    meas_active_threads_sum=active_threads;
-    return;
-  }
+  if (meas_active_threads_min==0 || meas_active_threads_min > threads)
+    meas_active_threads_min=threads;
 
-  if ((temp >= min_threads) && (temp <= os_thread_count)){
-    meas_active_threads_count++;
-    meas_active_threads_sum+=temp;
-    if (temp > meas_active_threads_max)
-      meas_active_threads_max=temp;
-    if (temp < meas_active_threads_min)
-      meas_active_threads_min=temp;
-  }
+  meas_active_threads_count += delta_time;
+  meas_active_threads_sum += active_threads * delta_time;
 }
 
-void scheduler::update_power_consumption(std::size_t power_sample) {
-  if (meas_power_max==0)
+void scheduler::update_power_consumption(std::size_t power_sample, int64_t delta_time)
+{
+  if (meas_power_max==0 || meas_power_max < power_sample)
     meas_power_max=power_sample;
 
-  if (meas_power_min==0)
+  if (meas_power_min==0 || meas_power_min > power_sample)
     meas_power_min=power_sample;
 
-  if (meas_power_sum==0){
-    meas_power_count++;
-    meas_power_sum=power_sample;
-    return;
-  }
 
-  if (power_sample <= 10000){
-    meas_power_count++;
-    meas_power_sum+=power_sample;
-    if (power_sample > meas_power_max)
-      meas_power_max=power_sample;
-    if (power_sample < meas_power_min)
-      meas_power_min=power_sample;
-  }
+  meas_power_count += delta_time;
+  meas_power_sum += power_sample * delta_time;
+
+  std::cout << "Reporting Threads:" << active_threads << " Power:" << power_sample << " for Dt:" << delta_time << std::endl;
 }
 #endif
 
@@ -1554,6 +1532,20 @@ void scheduler::stop() {
   /* Output all measured metrics */
 #ifdef DEBUG_MULTIOBJECTIVE_
 #ifdef MEASURE_
+  auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+  auto dt = timestamp_now - last_measure_power;
+  last_measure_power = timestamp_now;
+
+  update_active_osthreads(active_threads, dt);
+#ifdef ALLSCALE_HAVE_CPUFREQ
+  allscale::components::monitor *monitor_c = &allscale::monitor::get();
+
+  auto measurement = monitor_c->get_current_power();
+  if ( measurement <= 10000 ) {
+    update_power_consumption(measurement, dt);
+  }
+#endif
+
   std::cout << "\n****************************************************\n" << std::flush;
   std::cout << "Measured Metrics of Application Execution:\n"
 

From 6eca192d3717b93faf0ab21d5027189f1992a018 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 15 Nov 2018 14:41:36 +0000
Subject: [PATCH 09/37] Make monitor::get_current_power() threadsafe

Removed freq_idx domain reduction
  - If we re-enable that feature we probably have to add a map that
    converts a "fake" freq_idx to the actual one
---
 allscale/components/localoptimizer.hpp |  3 ++-
 src/components/monitor_component.cpp   | 22 ++++++++++++++--------
 src/components/nmsimplex_bbincr.cpp    |  5 +++--
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index cfc43b5..e7a77eb 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -155,7 +155,7 @@ struct localoptimizer
 	const std::vector<unsigned long>
 	setfrequencies(std::vector<unsigned long> frequencies)
 	{
-		#if 1
+		#if 0
 		const std::size_t max_freqs = 10;
 		std::size_t keep_every = (std::size_t) ceilf(frequencies.size() / (float) max_freqs);
 
@@ -224,6 +224,7 @@ struct localoptimizer
 	// VV: Used to convert thread_idx to actual number of threads
 	std::size_t threads_dt;
 
+
 	void accumulate_objective_measurements();
 	void reset_accumulated_measurements();
 
diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp
index 947bac4..41570b1 100644
--- a/src/components/monitor_component.cpp
+++ b/src/components/monitor_component.cpp
@@ -338,19 +338,24 @@ namespace allscale { namespace components {
       /*VV: Read potentially multiple measurements of power within the span of 
             POWER_MEASUREMENT_PERIOD_MS milliseconds. Each time this function
             is invoked it returns the running average of power.*/
-      static unsigned long long times_read_power=1;
-      static unsigned long long power_sum = util::hardware_reconf::read_system_power();
+      static mutex_type power_mtx;
+      static unsigned long long times_read_power=0;
+      static unsigned long long power_sum = 0ull;
+      static long timestamp_reset_power = 0;
 
-      static long timestamp_reset_power = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
-      
-      long t_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+      int64_t t_now, dt;
+      float ret;
 
-      auto dt = t_now - timestamp_reset_power;
+      std::lock_guard<mutex_type> lock(power_mtx);
+      
+      t_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+ 
+      dt = t_now - timestamp_reset_power;
       times_read_power ++;
 
       power_sum += util::hardware_reconf::read_system_power();
 
-      float ret = power_sum / (float)(times_read_power);
+      ret = power_sum / (float)(times_read_power);
 
       if ( dt >= POWER_MEASUREMENT_PERIOD_MS ) {
             times_read_power = 0;
@@ -368,7 +373,8 @@ namespace allscale { namespace components {
    float monitor::get_max_power()
    {
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-      return 0.0;
+      // VV: report 125.0 Watt ( this should be dynamically configured/discovered )
+      return 1250.0;
 #elif defined(POWER_ESTIMATE)
       return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_;
 #else
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 94e004c..54530c8 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -849,8 +849,9 @@ optstepresult NelderMead::step(const double objectives[])
 
         if ( warming_up_step > 0 ) {
             if ( (int) v[warming_up_step-1][0] != profiled_threads ) {
-                std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " threads "
-                            "but ended up using " << profiled_threads << std::endl;
+                std::cout << "[NelderMead|WARN] Meant to profile " 
+                        << v[warming_up_step-1] << " threads "
+                        "but ended up using " << profiled_threads << std::endl;
                 v[warming_up_step-1][0] = profiled_threads;
             }
             // VV: Record results of last warming up step

From 55517075088ab126f37c76388f7c764b51df9461 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 15 Nov 2018 16:59:23 +0000
Subject: [PATCH 10/37] Modifying Score to more closely match the one that
 Dashboard expects

---
 src/components/monitor_component.cpp | 6 +++++-
 src/components/nmsimplex_bbincr.cpp  | 2 +-
 src/dashboard.cpp                    | 4 +---
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp
index 594fc39..fde7877 100644
--- a/src/components/monitor_component.cpp
+++ b/src/components/monitor_component.cpp
@@ -397,7 +397,11 @@ namespace allscale { namespace components {
    float monitor::get_max_power()
    {
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-      // VV: report 125.0 Watt ( this should be dynamically configured/discovered )
+      // VV: report 1250 Watts
+      //  ( redbox paper 5283 for 8335-GTA indicates 1875 for the 
+      //   whole node but I've noticed up to ~1100-1200 Watts, for
+      //   the time being this is a good enough figure )
+      //  ( this should be dynamically configured/discovered )
       return 1250.0;
 #elif defined(POWER_ESTIMATE)
       return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_;
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 54530c8..f206cc3 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -284,7 +284,7 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
     #else 
     score = 0.0;
     for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++ i) {
-        score += exp(weights[i]*objectives[i]/scale[i]);
+        score *= exp(weights[i]*objectives[i]/scale[i]);
     }
     #endif
     return score;
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index 4e1c35b..fc8f9f8 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -60,9 +60,7 @@ namespace allscale { namespace dashboard
         state.speed = 1.f - state.idle_rate;
         state.efficiency = state.speed * (float(state.cur_frequency * active_cores) / float(state.max_frequency * state.num_cores));
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        state.power = monitor_c->get_current_power();
-#elif defined(POWER_ESTIMATE)
+#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         state.cur_power = monitor_c->get_current_power();
         state.max_power = monitor_c->get_max_power();
         state.power = state.cur_power / state.max_power;

From ce0052f693f63fad8a01908b5ffb17c811d8f129 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Fri, 16 Nov 2018 13:00:40 +0000
Subject: [PATCH 11/37] Cleanup and draft for Local-Optimizer integration with
 dashboard

---
 allscale/components/localoptimizer.hpp   |  96 +++---------
 allscale/components/nmsimplex_bbincr.hpp |  48 +++---
 allscale/components/scheduler.hpp        |  28 ++--
 src/components/localoptimizer.cpp        | 159 +++++++------------
 src/components/monitor_component.cpp     |   6 +-
 src/components/nmsimplex_bbincr.cpp      | 108 +++++++------
 src/components/scheduler_component.cpp   | 188 +++++++----------------
 src/optimizer.cpp                        |  15 +-
 src/scheduler.cpp                        |  36 +++++
 9 files changed, 286 insertions(+), 398 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index e7a77eb..d708d1d 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -31,12 +31,6 @@ enum objectiveType
 	resource
 };
 
-enum parameterType
-{
-	thread,
-	frequency
-};
-
 enum searchPolicy
 {
 	allscale,
@@ -44,50 +38,12 @@ enum searchPolicy
 	manual
 };
 
-/* structure type of a single optimization objective */
-struct objective
-{
-	double last_scores[3];
-
-	objectiveType type;
-	/* leeway threshold desired, 0-1 double */
-	double leeway;
-	/* non-negative integer priority of the objective, 0 is highest priority*/
-	int priority;
-	/* local minimum during single objective optimization */
-	double localmin;
-	/* local maximum during single objective optimization */
-	double localmax;
-	/* local minimum during single objective optimization */
-	double globalmin;
-	/* local minimum during single objective optimization */
-	double globalmax;
-	/* current deviation of the objective value from observed min */
-	double currentthreshold;
-	/* sampled objective values throughout execution */
-	std::vector<double> samples;
-	/* thread number that lead to the objective value in samples vector */
-	std::vector<double> threads_samples;
-	/* frequency index that lead to the objective value in samples vector */
-	std::vector<double> freq_samples;
-	/* true if optimization of objective has converged, false otherwise */
-	bool converged;
-	/* true if optimizer for objective has been initialized, false otherwise */
-	bool initialized;
-	/* index to the parameter vectors for setup that has so far achieved
-         the minimum over all samples */
-	long int min_params_idx;
-	double converged_minimum;
-	double minimization_params[2];
-};
 
 /* structure type modelling an optimization actuation action to be taken
        by the scheduler */
 struct actuation
 {
-	/* number of threads to resume (>0) or suspend (<0). If set to zero,
-          number of threads will stay unchanged. */
-	unsigned int delta_threads;
+	unsigned int threads;
 
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 	/* index to the global cpu-supported frequencies vector pointing to
@@ -109,16 +65,16 @@ struct localoptimizer
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 		  frequency_param_(0),
 #endif
-		  current_objective_idx_(0), 
 		  converged_(false),
 		  convergence_threshold_(0.01),
+		  time_weight(0.0),
+		  energy_weight(0.0),
+		  resource_weight(0.0),
 		  nmd(0.01)
 	{
 		if (optmethod_ == random)
 			srand(std::time(NULL));
 	}
-	localoptimizer(std::list<objective>);
-
 	bool isConverged();
 
 	void setPolicy(searchPolicy pol)
@@ -132,13 +88,26 @@ struct localoptimizer
 #endif
 	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
-	void initialize_nmd();
+	void initialize_nmd(bool from_scratch);
 #endif
-	double opt_weights[NMD_NUM_OBJECTIVES];
-
 	searchPolicy getPolicy() { return optmethod_; }
 
-	void setobjectives(std::list<objective>);
+	// VV: Modifying the objectives triggers restarting the optimizer
+	void setobjectives(double time_weight, 
+						double energy_weight, 
+						double resource_weight);
+
+	void getobjectives(double *time_weight, 
+					   double *energy_weight,
+					   double *resource_weight)
+	{
+		if ( time_weight != nullptr )
+			*time_weight = this->time_weight;
+		if ( energy_weight != nullptr )
+			*energy_weight = this->energy_weight;
+		if ( resource_weight != nullptr )
+			*resource_weight = this->resource_weight;
+	}
 
 	std::size_t getCurrentThreads() { return threads_param_; }
 
@@ -221,10 +190,11 @@ struct localoptimizer
 	}
 
   private:
+	double time_weight, energy_weight, resource_weight;
+
 	// VV: Used to convert thread_idx to actual number of threads
 	std::size_t threads_dt;
 
-
 	void accumulate_objective_measurements();
 	void reset_accumulated_measurements();
 
@@ -240,15 +210,8 @@ struct localoptimizer
 
 	bool mo_initialized;
 
-	/* vector of active optimization objectives. Objectives are stored
-           in the vector in decreasing priority order */
-	std::vector<objective> objectives_;
-
 	NelderMead nmd;
 
-	/* counts number of parameter changes (as pair) */
-	unsigned long long int param_changes_;
-
 	/* single objective optimization method used */
 	searchPolicy optmethod_ = random;
 
@@ -283,22 +246,9 @@ struct localoptimizer
 
 	/***** optimization state variables ******/
 
-	/* index to the _objectives vector of currently optimized objective */
-	unsigned short int current_objective_idx_;
-
-	/* number of times the optimizer step() has been invoked, this is for
-           init and housekeeping purposes */
-	unsigned long long int steps_;
-
-	/* currently optimized parameter */
-	parameterType current_param_;
-
 	/* initial warm-up steps */
 	const unsigned int warmup_steps_ = 3;
 
-	/* maximum number of optimization steps allowed */
-	const int max_steps_ = 100;
-
 	/* set to true if local optimizer has converged over all objectives */
 	bool converged_;
 };
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 58844d9..157cb0b 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -94,10 +94,16 @@ class NelderMead
 	NelderMead(double);
 	// VV: For the time being 
 	//     weights = [ W_time, W_energy/power, W_resources ]
+	//     initial_simplex = double[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS]
 	//     constraint_min = [min_threads, min_freq_idx]
-	void initialize_simplex(double weights[NMD_NUM_OBJECTIVES],
-							double constraint_min[NMD_NUM_KNOBS],
-							double constraint_max[NMD_NUM_KNOBS]);
+	void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES],
+							const double initial_simplex[][NMD_NUM_KNOBS],
+							const double constraint_min[NMD_NUM_KNOBS],
+							const double constraint_max[NMD_NUM_KNOBS]);
+	
+	void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES],
+							const double constraint_min[NMD_NUM_KNOBS],
+							const double constraint_max[NMD_NUM_KNOBS]);
 
 	void print_initial_simplex();
 	void print_iteration();
@@ -112,9 +118,16 @@ class NelderMead
 		return min;
 	}
 
+	// VV: Copies the simplex into a caller-provided [NMD_NUM_KNOBS+1][NMD_NUM_KNOBS] array
+	void get_simplex(double simplex[][NMD_NUM_KNOBS]) {
+		for (auto i=0; i<NMD_NUM_KNOBS+1; ++i)
+			for (auto j=0; j<NMD_NUM_KNOBS; ++j)
+				simplex[i][j] = v[i][j];
+	}
+
 	unsigned long int getIterations() { return itr; }
 	double evaluate_score(const double objectives[], const double *weights) const;
-	void set_weights(double weights[]);
+	void set_weights(const double weights[]);
 
 	optstepresult step(const double objectives[]);
 
@@ -148,21 +161,6 @@ class NelderMead
 					  const double objectives[],
 					  bool add_if_new);
 
-	double round2(double num, int precision)
-	{
-		double rnum = 0.0;
-		int tnum;
-
-		if (num == 0.0)
-			return num;
-
-		rnum = num * pow(10, precision);
-		tnum = (int)(rnum < 0 ? rnum - 0.5 : rnum + 0.5);
-		rnum = tnum / pow(10, precision);
-
-		return rnum;
-	}
-
 	bool convergence_reevaluating;
 	int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
 	
@@ -186,10 +184,10 @@ class NelderMead
 	int itr;
 
 	/* holds vertices of simplex */
-	double **v;
+	double v[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
 
 	/* value of function at each vertex */
-	double *f;
+	double f[NMD_NUM_KNOBS+1];
 
 	/* value of function at reflection point */
 	double fr;
@@ -201,16 +199,16 @@ class NelderMead
 	double fc;
 
 	/* reflection - coordinates */
-	double *vr;
+	double vr[NMD_NUM_KNOBS];
 
 	/* expansion - coordinates */
-	double *ve;
+	double ve[NMD_NUM_KNOBS];
 
 	/* contraction - coordinates */
-	double *vc;
+	double vc[NMD_NUM_KNOBS];
 
 	/* centroid - coordinates */
-	double *vm;
+	double vm[NMD_NUM_KNOBS];
 
 	double min;
 
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 9d2ca2c..9437eae 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -65,7 +65,13 @@ namespace allscale { namespace components {
         {
             return active_threads;
         }
-
+        
+        void set_local_optimizer_weights(double time_weight, 
+                                         double energy_weight,
+                                         double resource_weight);
+        void get_local_optimizer_weights(double *time_weight, 
+                                         double *energy_weight,
+                                         double *resource_weight);
     private:
 
         std::size_t get_num_numa_nodes();
@@ -84,12 +90,11 @@ namespace allscale { namespace components {
         bool do_split(work_item const& work, std::size_t numa_node);
 
         bool collect_counters();
-        //try to suspend resource_step threads, return number of threads which received a new suspend order;
-        // REM unsigned int suspend_threads();
-        unsigned int suspend_threads(std::size_t);
-        //try to resume resource_step threads, return number of threads which received a new resume order;
-        // REM         unsigned int resume_threads();
-        unsigned int resume_threads(std::size_t);
+        //try to suspend threads, return number of threads which received a new suspend order;
+                unsigned int suspend_threads(std::size_t);
+        
+        //try to resume threads, return number of threads which received a new resume order;
+                unsigned int resume_threads(std::size_t);
 
 #ifdef MEASURE_
         // convenience methods to update measured metrics of interest
@@ -163,11 +168,10 @@ namespace allscale { namespace components {
         // Indices correspond to the freq id in cpu_freqs, and
         // each pair holds energy usage and execution time
         std::vector<std::pair<unsigned long long, double>> freq_times;
-        std::vector<std::vector<std::pair<unsigned long long, double>>> objectives_status;
+        
         unsigned int freq_step;
         bool target_freq_found;
 #endif
-        unsigned int resource_step;
         bool target_resource_found;
 
         mutable mutex_type throttle_mtx_;
@@ -189,9 +193,9 @@ namespace allscale { namespace components {
         bool resource_requested;
         bool energy_requested;
 
-        double time_leeway;
-        double resource_leeway;
-        double energy_leeway;
+        double time_weight;
+        double resource_weight;
+        double energy_weight;
         unsigned int period_for_time;
         unsigned int period_for_resource;
         unsigned int period_for_power;
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 19a05e8..5dfd932 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -24,111 +24,48 @@ namespace allscale
 {
 namespace components
 {
-	#if 0
-localoptimizer::localoptimizer(std::list<objective> targetobjectives)
-	: objectives_((int)targetobjectives.size()),
-	  nmd(convergence_threshold_),
-	  param_changes_(0),
-	  steps_(0),
-	  current_param_(thread),
-	  converged_(false)
+void localoptimizer::setobjectives(double time_weight, 
+								   double energy_weight, 
+								   double resource_weight)
 {
-	for (objective o : targetobjectives)
-	{
-		//std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-		objectives_[o.priority] = o;
-		objectives_[o.priority].localmin = 10000;
-		objectives_[o.priority].globalmin = 10000;
-		objectives_[o.priority].localmax = 0.0;
-		objectives_[o.priority].globalmax = 0.0;
-		objectives_[o.priority].converged = false;
-		objectives_[o.priority].initialized = false;
-		objectives_[o.priority].min_params_idx = 0;
-		objectives_[o.priority].converged_minimum = 0;
-	}
-#ifdef ALLSCALE_HAVE_CPUFREQ
-	setCurrentFrequencyIdx(0);
-#endif
-};
-#endif
+	this->time_weight = time_weight;
+	this->energy_weight = energy_weight;
+	this->resource_weight = resource_weight;
 
-void localoptimizer::setobjectives(std::list<objective> targetobjectives)
-{
-	objectives_.clear();
-	objectives_.resize((int)targetobjectives.size());
-
-	explore_knob_domain = true;
-
-	for (objective o : targetobjectives)
-	{
-		//std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-		objectives_[o.priority] = o;
-		objectives_[o.priority].localmin = 10000;
-		objectives_[o.priority].globalmin = 10000;
-		objectives_[o.priority].localmax = 0.0;
-		objectives_[o.priority].globalmax = 0.0;
-		objectives_[o.priority].converged = false;
-		objectives_[o.priority].initialized = false;
-		objectives_[o.priority].min_params_idx = 0;
-		objectives_[o.priority].converged_minimum = 0;
-
-		opt_weights[o.type] = o.leeway;
-	}
-	steps_ = 0;
-	param_changes_ = 0;
-	current_param_ = thread;
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	setCurrentFrequencyIdx(0);
 #endif
-	converged_ = false;
+
+	// VV: Modifying the objectives triggers restarting the optimizer
+	//     from scratch
+	initialize_nmd(true);
 }
 
 void localoptimizer::reset(int threads, int freq_idx)
 {
 	threads_param_ = threads;
-	param_changes_ = 0;
 	thread_param_values_.clear();
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	frequency_param_ = freq_idx;
 	frequency_param_values_.clear();
 #endif
-	current_objective_idx_ = 0;
-	steps_ = 0;
-	current_param_ = thread;
 	converged_ = false;
 };
 
 #ifdef DEBUG_
 void localoptimizer::printobjectives()
 {
-	for (auto &el : objectives_)
-	{
-		std::cout << "Objective"
-				  << "\t\t"
-				  << "Priority"
-				  << "\t\t"
-				  << "Leeway" << std::endl;
-		switch (el.type)
-		{
-		case time:
-			std::cout << "Time"
-					  << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
-			break;
-		case energy:
-			std::cout << "Energy"
-					  << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
-			break;
-		case resource:
-			std::cout << "Resource"
-					  << "\t\t" << el.priority << "\t\t" << el.leeway << std::endl;
-			break;
-		}
-	}
+	std::cout << "[LocalOptimizer|DEBUG] Weights=[time:" << time_weight
+			  << ", energy:" << energy_weight
+			  << ", resource:" << resource_weight << "]" << std::endl;
 }
+#endif
 
 bool localoptimizer::isConverged()
 {	
 	#if 0
+	// VV: This is an attempt to make optimization choices for 
+	//     tasks of smaller granularity (after splitting a task)
 	if ( converged_ == false ) {
 		return false;
 	}
@@ -166,8 +103,6 @@ void localoptimizer::printverbosesteps(actuation act)
 #endif
 }
 
-#endif
-
 void localoptimizer::accumulate_objective_measurements()
 {
 	if (pending_num_times)
@@ -205,16 +140,30 @@ void localoptimizer::setmaxthreads(std::size_t threads)
 }
 
 #ifdef ALLSCALE_HAVE_CPUFREQ
-void localoptimizer::initialize_nmd()
+void localoptimizer::initialize_nmd(bool from_scratch)
 {
-	// VV: Place reasonable limits to #threads and cpu_freq tunable knobs
+	// VV: Place constraints to #threads and cpu_freq tunable knobs
 
 	double constraint_min[] = {1, 0};
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
 							   (double)frequencies_param_allowed_.size() - 1};
+	const double opt_weights[] = { time_weight, energy_weight, resource_weight };
 
-	nmd.initialize_simplex(opt_weights,
-						   constraint_min, constraint_max);
+	if( from_scratch == false ){
+		double prev_simplex[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
+	
+		nmd.get_simplex(prev_simplex);
+
+		nmd.initialize_simplex(opt_weights,
+								prev_simplex,
+								constraint_min, 
+								constraint_max);
+	} else {
+		nmd.initialize_simplex(opt_weights,
+								nullptr,
+								constraint_min, 
+								constraint_max);
+	}
 
 	mo_initialized = true;
 	explore_knob_domain = true;
@@ -229,13 +178,10 @@ void localoptimizer::measureObjective(double iter_time, double power, double thr
 			  << power << " "
 			  << threads << std::endl;
 
-	if (steps_)
-	{
-		pending_time += iter_time;
-		pending_energy += power;
-		pending_threads += threads;
-		pending_num_times++;
-	}
+	pending_time += iter_time;
+	pending_energy += power;
+	pending_threads += threads;
+	pending_num_times++;
 }
 
 void localoptimizer::reset_accumulated_measurements()
@@ -248,28 +194,25 @@ void localoptimizer::reset_accumulated_measurements()
 
 actuation localoptimizer::step()
 {
-
-	steps_++;
 	actuation act;
-	act.delta_threads = threads_param_;
+	act.threads = threads_param_;
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	act.frequency_idx = frequency_param_;
 #endif
 	/* random optimization step */
 	if (optmethod_ == random)
 	{
-		act.delta_threads = (rand() % max_threads_);
+		act.threads = (rand() % max_threads_);
 #ifdef ALLSCALE_HAVE_CPUFREQ
 		act.frequency_idx = rand() % frequencies_param_allowed_.size();
-		// if (act.frequency_idx == frequency_param_)
-		//     act.frequency_idx = -1;
 #endif
 	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	else if (optmethod_ == allscale)
 	{
+		// VV: Keep track of dirty objectives
 		if (mo_initialized == false)
-			initialize_nmd();
+			initialize_nmd(true);
 				
 		accumulate_objective_measurements();
 		const double latest_measurements[] = {pending_time, 
@@ -297,34 +240,36 @@ actuation localoptimizer::step()
 				std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << min_score << " Threads = " << minimization_point[0] << " Freq_idx = " << minimization_point[1] << std::endl;
 				std::cout << "******************************************" << std::endl;
 #endif
-				act.delta_threads = minimization_point[0];
+				act.threads = minimization_point[0];
 				act.frequency_idx = minimization_point[1];
 				// VV: Stop searching for new knob_set
 				explore_knob_domain = false;
 				converged_ = true;
 			} else {
 				// VV: Have not converged yet, keep exploring
-				act.delta_threads = nmd_res.threads;
+				act.threads = nmd_res.threads;
 				act.frequency_idx = nmd_res.freq_idx;
 			}
 			
-			act.delta_threads *= threads_dt;
+			act.threads *= threads_dt;
+#ifdef DEBUG_MULTIOBJECTIVE_
 			std::cout << "[LOCALOPTIMIZER|DEBUG] ACTUAL Vertex to try:";
-			std::cout << " Threads = " << act.delta_threads;
+			std::cout << " Threads = " << act.threads;
 			std::cout << " Freq Idx = " << act.frequency_idx << std::endl;
+#endif
 		}
 	}
 #endif // ALLSCALE_HAVE_CPUFREQ
 
 validate_act:
 
-	if (act.delta_threads > max_threads_)
+	if (act.threads > max_threads_)
 	{
-		act.delta_threads = max_threads_;
+		act.threads = max_threads_;
 	}
-	else if (act.delta_threads < 1)
+	else if (act.threads < 1)
 	{
-		act.delta_threads = getCurrentThreads();
+		act.threads = getCurrentThreads();
 	}
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	// VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp
index fde7877..d1817ae 100644
--- a/src/components/monitor_component.cpp
+++ b/src/components/monitor_component.cpp
@@ -397,12 +397,12 @@ namespace allscale { namespace components {
    float monitor::get_max_power()
    {
 #if defined(ALLSCALE_HAVE_CPUFREQ)
-      // VV: report 1250 Watts
+      // VV: report 1100 Watts
       //  ( redbox paper 5283 for 8335-GTA indicates 1875 for the 
-      //   whole node but I've noticed up to ~1100-1200 Watts, for
+      //   whole node but I've noticed up to ~1100 Watts, for
       //   the time being this is a good enough figure )
       //  ( this should be dynamically configured/discovered )
-      return 1250.0;
+      return 1100.0;
 #elif defined(POWER_ESTIMATE)
       return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_;
 #else
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index f206cc3..864f3fd 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -36,25 +36,9 @@ NelderMead::NelderMead(double eps)
 #endif
     itr = 0;
     state_ = warmup;
-
-    /* dynamically allocate arrays */
-
-    /* allocate the rows of the arrays */
-    v = (double **)malloc((n + 1) * sizeof(double *));
-    f = (double *)malloc((n + 1) * sizeof(double));
-    vr = (double *)malloc(n * sizeof(double));
-    ve = (double *)malloc(n * sizeof(double));
-    vc = (double *)malloc(n * sizeof(double));
-    vm = (double *)malloc(n * sizeof(double));
     
     warming_up_step = 0;
     convergence_reevaluating = false;
-
-    /* allocate the columns of the arrays */
-    for (i = 0; i <= n; i++)
-    {
-        v[i] = (double *)malloc(n * sizeof(double));
-    }
 }
 
 std::pair<int, NelderMead::direction> NelderMead::explore_next_extra(double *extra, int level, 
@@ -196,17 +180,6 @@ void NelderMead::generate_new(F &gen)
 
 void NelderMead::my_constraints(double x[])
 {
-    // round to integer and bring again with allowable margins
-    // todo fix: generalize
-
-    // if (x[0] < constraint_min[0] || x[0] > constraint_max[0]){
-    //   x[0] = (constraint_min[0] + constraint_max[0])/2;
-    // }
-
-    // if (x[1] < constraint_min[1] || x[1] > constraint_max[1]){
-    //   x[1] = (constraint_min[1] + constraint_max[1])/2;
-    // }
-
     for (auto i = 0u; i < 2u; ++i)
     {
         if (x[i] < constraint_min[i])
@@ -268,7 +241,7 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
 {
     double score;
     // VV: [time, energy/power, resources]
-    double scale[] = {1.0, 1000.0, 1.0};
+    double scale[] = {1.0, 1100., 1.0};
     scale[2] = (double)constraint_max[0];
 
     if (weights == nullptr)
@@ -282,7 +255,7 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
         score += t * t * weights[i];
     }
     #else 
-    score = 0.0;
+    score = 1.0;
     for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++ i) {
         score *= exp(weights[i]*objectives[i]/scale[i]);
     }
@@ -290,7 +263,7 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
     return score;
 }
 
-void NelderMead::set_weights(double weights[3])
+void NelderMead::set_weights(const double weights[3])
 {
     opt_weights[0] = weights[0];
     opt_weights[1] = weights[1];
@@ -303,10 +276,9 @@ void NelderMead::set_weights(double weights[3])
     )
 }
 
-/* FIXME: generalize */
-void NelderMead::initialize_simplex(double weights[3],
-                                    double constraint_min[2],
-                                    double constraint_max[2])
+void NelderMead::initialize_simplex(const double weights[3],
+                                    const double constraint_min[2],
+                                    const double constraint_max[2])
 {
     int i, j;
     long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
@@ -323,21 +295,57 @@ void NelderMead::initialize_simplex(double weights[3],
     warming_up_step = 0;
     convergence_reevaluating = false;
     cache_.clear();
+}
+
+/* FIXME: generalize */
+void NelderMead::initialize_simplex(const double weights[3],
+                                    const double initial_simplex[][NMD_NUM_KNOBS],
+                                    const double constraint_min[2],
+                                    const double constraint_max[2])
+{
+    int i, j;
+    long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    for (i = 0; i < NMD_NUM_KNOBS; i++)
+    {
+        this->constraint_min[i] = constraint_min[i];
+        this->constraint_max[i] = constraint_max[i];
+    }
 
-    int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
-                + constraint_min[1]);
-    int threads_med = round(0.5 * (constraint_max[0] - constraint_min[1])
-                + constraint_min[1]);
-    int threads_high = constraint_max[0] * 0.75;
+    set_weights(weights);
+    state_ = warmup;
+    itr = 0;
+    warming_up_step = 0;
+    convergence_reevaluating = false;
+    cache_.clear();
+    if (initial_simplex == nullptr)
+    {
+        int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
+                    + constraint_min[1]);
+        int threads_med = round(0.5 * (constraint_max[0] - constraint_min[1])
+                    + constraint_min[1]);
+        int threads_high = constraint_max[0] * 0.75;
 
-    initial_configurations[0][0] = threads_low;
-    initial_configurations[0][1] = (int)constraint_min[1];
+        initial_configurations[0][0] = threads_low;
+        initial_configurations[0][1] = (int)constraint_min[1];
 
-    initial_configurations[1][0] = threads_med;
-    initial_configurations[1][1] = (int)constraint_max[1];
+        initial_configurations[1][0] = threads_med;
+        initial_configurations[1][1] = (int)constraint_max[1];
 
-    initial_configurations[2][0] = threads_high;
-    initial_configurations[2][1] = (int)constraint_max[1];
+        initial_configurations[2][0] = threads_high;
+        initial_configurations[2][1] = (int)constraint_max[1];
+    } else {
+        double knob_set[NMD_NUM_KNOBS];
+        for (i=0; i<NMD_NUM_KNOBS+1; ++i ) {
+            for (j=0; j<NMD_NUM_KNOBS; ++j ) {
+                knob_set[j] = initial_simplex[i][j];
+            }
+            my_constraints(knob_set);
+            for (j=0; j<NMD_NUM_KNOBS; ++j ) {
+                initial_configurations[i][j] = (int) knob_set[j];
+            }
+        }
+    }
 }
 
 /* print out the initial values */
@@ -847,7 +855,7 @@ optstepresult NelderMead::step(const double objectives[])
         // VV: Make sure that we actually profiled what we meant to
         int profiled_threads = objectives[2];
 
-        if ( warming_up_step > 0 ) {
+        if ( warming_up_step > 0 && warming_up_step < NMD_NUM_KNOBS + 1) {
             if ( (int) v[warming_up_step-1][0] != profiled_threads ) {
                 std::cout << "[NelderMead|WARN] Meant to profile " 
                         << v[warming_up_step-1] << " threads "
@@ -858,12 +866,14 @@ optstepresult NelderMead::step(const double objectives[])
             f[warming_up_step-1] = evaluate_score(objectives, nullptr);
             cache_update(profiled_threads, v[warming_up_step-1][1], 
                          objectives, true);
-        }
+        } 
 
         if ( warming_up_step == NMD_NUM_KNOBS + 1) {
             // VV: We need not explore the knob_set space anymore
             state_ = start;
             return step(objectives);
+        } else if (warming_up_step > NMD_NUM_KNOBS + 1) {
+            std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl;
         }
 
         optstepresult res;
@@ -1010,7 +1020,9 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
             }
         }
 
-        print_initial_simplex();
+        OUT_DEBUG (
+            print_initial_simplex();
+        )
 
         return false;
     } else {
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 9b522d8..6875533 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -57,7 +57,6 @@ scheduler::scheduler(std::uint64_t rank)
       target_freq_found(false)
 #endif
       ,
-      resource_step(1),
       target_resource_found(false),
       sampling_interval(10),
       current_avg_iter_time(0.0),
@@ -65,9 +64,9 @@ scheduler::scheduler(std::uint64_t rank)
       time_requested(false),
       resource_requested(false),
       energy_requested(false),
-      time_leeway(1.0),
-      resource_leeway(1.0),
-      energy_leeway(1.0),
+      time_weight(0.0),
+      resource_weight(0.0),
+      energy_weight(0.0),
       period_for_time(10),
       period_for_resource(10),
       period_for_power(20),
@@ -193,14 +192,11 @@ std::size_t scheduler::get_num_numa_cores(std::size_t domain) {
  *
 */
 void scheduler::init() {
-
-  std::vector<objectiveType> objectives_priorities;
-  int objectives_priority_idx=0;
-
   std::size_t num_localities = allscale::get_num_localities();
 
   std::unique_lock<mutex_type> l(resize_mtx_);
   hpx::util::ignore_while_checking<std::unique_lock<mutex_type>> il(&l);
+
   if (initialized_)
     return;
 
@@ -281,95 +277,53 @@ void scheduler::init() {
 #ifdef DEBUG_INIT_
       std::cout << "Scheduling Objective provided: " << obj << "\n";
 #endif
-      // Don't scale objectives if none is given
-      double leeway = 1.0;
+      // VV: Don't scale objectives if none is given
+      double opt_weight = 1.0;
 
       if (idx != std::string::npos) {
 #ifdef DEBUG_INIT_
-        std::cout << "Found a leeway, triggering multi-objectives policies\n"
-                  << std::flush;
+        std::cout << "Found an optimization weight, triggering " 
+                     "multi-objectives policies\n" << std::flush;
 #endif
 
         multi_objectives = true;
         obj = objective_str.substr(0, idx);
-        leeway = std::stod(objective_str.substr(idx + 1));
+        opt_weight = std::stod(objective_str.substr(idx + 1));
       }
 
       if (obj == "time") {
           time_requested = true;
-          objectives_priorities.push_back(time);
-#ifdef DEBUG_INIT_
-          std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx]
-          << std::endl;
-#endif
-          time_leeway = leeway;
+          time_weight = opt_weight;
 #ifdef DEBUG_INIT_
-          std::cout << "Set time margin to " << time_leeway << "\n" << std::flush;
+          std::cout << "Set time weight to " << time_weight << "\n" << std::flush;
 #endif
-
       } else if (obj == "resource") {
-          resource_requested = true;
-          objectives_priorities.push_back(resource);
+        resource_requested = true;
+        resource_weight = opt_weight;
 #ifdef DEBUG_INIT_
-          std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx]
-          << std::endl;
-#endif
-        resource_leeway = leeway;
-#ifdef DEBUG_INIT_
-        std::cout << "Set resource margin to " << resource_leeway << "\n"
+        std::cout << "Set resource weight to " << resource_weight << "\n"
                   << std::flush;
-        ;
 #endif
 
       } else if (obj == "energy") {
-          energy_requested = true;
-          objectives_priorities.push_back(energy);
-#ifdef DEBUG_INIT_
-          std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx]
-          << std::endl;
-#endif
-        energy_leeway = leeway;
+        energy_requested = true;
+        energy_weight = opt_weight;
 #ifdef DEBUG_INIT_
-        std::cout << "Set energy margin to " << energy_leeway << "\n"
+        std::cout << "Set energy weight to " << energy_weight << "\n"
                   << std::flush;
-        ;
 #endif
       } else {
-        std::ostringstream all_keys;
-        copy(scheduler::objectives.begin(), scheduler::objectives.end(),
-             std::ostream_iterator<std::string>(all_keys, ","));
-        std::string keys_str = all_keys.str();
-        keys_str.pop_back();
         HPX_THROW_EXCEPTION(
             hpx::bad_request, "scheduler::init",
             boost::str(
-                boost::format("Wrong objective: %s, Valid values: [%s]") % obj %
-                keys_str));
+                boost::format("Wrong objective: %s, Valid values: [time, energy, resource]") % obj));
       }
 
-      if (time_leeway > 1 || resource_leeway > 1 || energy_leeway > 1) {
+      if (time_weight > 2 || resource_weight > 2 || energy_weight > 2
+          || time_weight < -2 || resource_weight < -2 || energy_weight < -2) {
         HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
-                            "leeways should be within ]0, 1]");
+                            "Objective weights should be within [-2, 2]");
       }
-      objectives_priority_idx++;
-    }
-  }
-  objectives_priority_idx--;
-
-  /* Reading optional user provided input for granularity (step) of
-     adding/removing resources to/from the runtime (where resource=OS thread) */
-  std::string input_resource_step_str =
-      hpx::get_config_entry("allscale.resource_step", "");
-  if (!input_resource_step_str.empty()) {
-
-    resource_step = std::stoul(input_resource_step_str);
-#ifdef DEBUG_INIT_
-    std::cout << "Resource step provided : " << resource_step << "\n";
-#endif
-    if (resource_step == 0 || resource_step >= os_thread_count) {
-      HPX_THROW_EXCEPTION(
-          hpx::bad_request, "scheduler::init",
-          "resource step should be within ]0, total nb threads[");
     }
   }
 
@@ -400,16 +354,13 @@ void scheduler::init() {
 
 #if defined(ALLSCALE_HAVE_CPUFREQ)
   if (multi_objectives) {
-    // reallocating objectives_status vector of vectors
-    objectives_status.resize(3);
-    for (int i = 0; i < 3; i++) {
-      objectives_status[i].resize(3);
-    }
+
 #ifdef DEBUG_INIT_
     std::cout << "\n****************************************************\n" << std::flush;
-    std::cout << "Policy selected: multi-objective set with time=" << time_leeway
-              << ", resource=" << resource_leeway
-              << ", energy=" << energy_leeway << "\n"
+    std::cout << "Policy selected: multi-objective set with time=" << time_weight
+              << ", energy=" << energy_weight 
+              << ", resource=" << resource_weight
+              << "\n"
               << std::flush;
     std::cout << "Objectives Flags Set: \n" <<
               "\tTime: " << time_requested <<
@@ -447,53 +398,9 @@ void scheduler::init() {
     last_optimization_timestamp_ = t_duration_now;
     last_objective_measurement_timestamp_= t_duration_now;
 
-    std::list<objective> objectives_temp;
-    if (energy_requested){
-      objective o_temp;
-      o_temp.type=energy;
-      o_temp.leeway=energy_leeway;
-      int i=0;
-      for(auto& el: objectives_priorities){
-        if (el==energy){
-          o_temp.priority=i;
-          break;
-        }
-        ++i;
-      }
-      objectives_temp.push_back(o_temp);
-    }
-    if (time_requested){
-      objective o_temp;
-      o_temp.type=time;
-      o_temp.leeway=time_leeway;
-      int i=0;
-      for(auto& el: objectives_priorities){
-        if (el==time){
-          o_temp.priority=i;
-          break;
-        }
-        ++i;
-      }
-      objectives_temp.push_back(o_temp);
-    }
-    if (resource_requested){
-      objective o_temp;
-      o_temp.type=resource;
-      o_temp.leeway=resource_leeway;
-      int i=0;
-      for(auto& el: objectives_priorities){
-        if (el==resource){
-          o_temp.priority=i;
-          break;
-        }
-        ++i;
-      }
-      objectives_temp.push_back(o_temp);
-    }
-    lopt_.setobjectives(objectives_temp);
     lopt_.setmaxthreads(os_thread_count);
-    lopt_.reset(os_thread_count,0);
-  #if defined(ALLSCALE_HAVE_CPUFREQ)
+
+ #if defined(ALLSCALE_HAVE_CPUFREQ)
     using hardware_reconf = allscale::components::util::hardware_reconf;
     auto  freqs = hardware_reconf::get_frequencies(0);
 
@@ -502,7 +409,16 @@ void scheduler::init() {
       HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
       "error in initializing the local optimizer, allowed frequency values are empty");
     }
-  #endif
+    // VV: Set to max number of threads and max frequency
+    lopt_.reset(os_thread_count, freqs.size()-1);
+#else
+    // VV: Max number of threads, and an arbitrary frequency index
+    lopt_.reset(os_thread_count,0);
+#endif
+    
+    // VV: Set objectives after setting all constraints to
+    //     trigger the initialization of nmd
+    lopt_.setobjectives(time_weight, energy_weight, resource_weight);
 #ifdef DEBUG_
     lopt_.printobjectives();
 #endif
@@ -819,32 +735,46 @@ void scheduler::optimize_locally(work_item const& work)
 #endif
                 // amend threads if signaled
                 
-                if (act_temp.delta_threads < active_threads){
+                if (act_temp.threads < active_threads){
 #ifdef DEBUG_MULTIOBJECTIVE_
                     std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
-                    << " , target threads = " << act_temp.delta_threads << std::endl;
+                    << " , target threads = " << act_temp.threads << std::endl;
 
 #endif    
                     //unsigned int suspended_temp = suspend_threads(new_threads_target);
                     //lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp);
-                    suspend_threads(active_threads-act_temp.delta_threads);
+                    suspend_threads(active_threads-act_temp.threads);
                 }
-                else if (act_temp.delta_threads > active_threads){
+                else if (act_temp.threads > active_threads){
 #ifdef DEBUG_MULTIOBJECTIVE_
                     std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
-                    << " , target threads = " << act_temp.delta_threads << std::endl;
+                    << " , target threads = " << act_temp.threads << std::endl;
 #endif
-                    resume_threads(act_temp.delta_threads - active_threads);
+                    resume_threads(act_temp.threads - active_threads);
                 }
                 fix_allcores_frequencies(act_temp.frequency_idx);
                 lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx);
-                lopt_.setCurrentThreads(act_temp.delta_threads);
+                lopt_.setCurrentThreads(act_temp.threads);
             }
         } // uselopt
 #endif
     }
 }
 
+void scheduler::set_local_optimizer_weights(double time_weight, 
+                                         double energy_weight,
+                                         double resource_weight)
+{
+    lopt_.setobjectives(time_weight, energy_weight, resource_weight);
+}
+
+void scheduler::get_local_optimizer_weights(double *time_weight,
+                                           double *energy_weight,
+                                           double *resource_weight)
+{
+    lopt_.getobjectives(time_weight, energy_weight, resource_weight);
+}
+
 std::pair<work_item, std::unique_ptr<data_item_manager::task_requirements_base>> scheduler::schedule_local(work_item work,
         std::unique_ptr<data_item_manager::task_requirements_base>&& reqs,
         runtime::HierarchyAddress const& addr)
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index fc90f8b..e919fb6 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -126,6 +126,19 @@ tuning_objective get_default_objective()
         return tuning_objective::efficiency();
     if (obj == "power")
         return tuning_objective::power();
+    if ( obj == "local") {
+        double time_weight, energy_weight, resource_weight;
+        
+        auto &&local_scheduler = scheduler::get();
+
+        local_scheduler.get_local_optimizer_weights(&time_weight,
+                                                    &energy_weight,
+                                                    &resource_weight);
+        // VV: If the local-optimizer is used too then copy its objectives
+        return tuning_objective(time_weight, 
+                                resource_weight, 
+                                energy_weight);
+    }
 
     float speed = 0.0f;
     float efficiency = 0.0f;
@@ -233,7 +246,7 @@ void global_optimizer::tune(std::vector<optimizer_state> const &state)
             total_efficiency += state[i].load_ * (float(state[i].active_frequency_ * state[i].cores_per_node_) / float(max_frequency * state[i].cores_per_node_));;
             used_power += state[i].energy_;
         }
-#ifdef POWER_ESTIMATE
+#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         max_power += monitor_c->get_max_power();
 #endif
     }
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 485abc9..d88d568 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -370,18 +370,54 @@ namespace allscale
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.speed_exponent = exp;
+            double time_weight, energy_weight, resource_weight;
+
+            auto &&local_scheduler = scheduler::get();
+
+            local_scheduler.get_local_optimizer_weights(&time_weight,
+                                                        &energy_weight,
+                                                        &resource_weight);
+            time_weight = (double) exp;
+
+            local_scheduler.set_local_optimizer_weights(time_weight,
+                                                        energy_weight,
+                                                        resource_weight);
         }
 
         void set_efficiency_exponent(float exp)
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.efficiency_exponent = exp;
+            double time_weight, energy_weight, resource_weight;
+
+            auto &&local_scheduler = scheduler::get();
+
+            local_scheduler.get_local_optimizer_weights(&time_weight,
+                                                        &energy_weight,
+                                                        &resource_weight);
+            resource_weight = (double) exp;
+
+            local_scheduler.set_local_optimizer_weights(time_weight,
+                                                        energy_weight,
+                                                        resource_weight);
         }
 
         void set_power_exponent(float exp)
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.power_exponent = exp;
+            double time_weight, energy_weight, resource_weight;
+            
+            auto &&local_scheduler = scheduler::get();
+
+            local_scheduler.get_local_optimizer_weights(&time_weight,
+                                                        &energy_weight,
+                                                        &resource_weight);
+            energy_weight = (double) exp;
+
+            local_scheduler.set_local_optimizer_weights(time_weight,
+                                                        energy_weight,
+                                                        resource_weight);
         }
 
         hpx::util::tuple<float, float, float> get_optimizer_exponents()

From d1ea9d64b7f2c817496a698cbdde44c519f3377a Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Fri, 16 Nov 2018 14:48:30 +0000
Subject: [PATCH 12/37] Report avg_iteration_time as "speed" and number of
 threads as "efficiency"

- This is only the case when the runtime is built with CPUFreq support (ALLSCALE_HAVE_CPUFREQ)
- I've also modified the dashboard
---
 src/components/nmsimplex_bbincr.cpp | 11 +++++++++--
 src/dashboard.cpp                   |  5 +++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 864f3fd..94cea8a 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -139,6 +139,7 @@ void NelderMead::generate_new(F &gen)
 
     max_level *= 2;
     max_nested_level *=2;
+    auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
     int is_same;
     do
@@ -147,7 +148,13 @@ void NelderMead::generate_new(F &gen)
         
         auto key = std::make_pair((int)new_set[0], (int)new_set[1]);
         auto entry = cache_.find(key);
-        is_same = (entry != cache_.end());
+        
+        is_same = 0;
+
+        if ( entry != cache_.end() ) {
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+            is_same = dt <= entry->second._cache_expires_dt;
+        }
 
         if ( ( level < max_level +1) 
              && is_same 
@@ -281,7 +288,7 @@ void NelderMead::initialize_simplex(const double weights[3],
                                     const double constraint_max[2])
 {
     int i, j;
-    long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
     for (i = 0; i < NMD_NUM_KNOBS; i++)
     {
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index fc8f9f8..99bd6fb 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -57,8 +57,13 @@ namespace allscale { namespace dashboard
 
         state.productive_cycles_per_second = float(state.cur_frequency) * (1.f - state.idle_rate);  // freq to Hz
 
+#ifdef ALLSCALE_HAVE_CPUFREQ
+        state.speed = monitor_c->get_avg_time_last_iterations(100);
+        state.efficiency = active_cores;
+#else
         state.speed = 1.f - state.idle_rate;
         state.efficiency = state.speed * (float(state.cur_frequency * active_cores) / float(state.max_frequency * state.num_cores));
+#endif
 
 #if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         state.cur_power = monitor_c->get_current_power();

From 711479b3ec72e0d740a89393fe06878f6bbdd35a Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Fri, 16 Nov 2018 20:16:08 +0000
Subject: [PATCH 13/37] Randomize initial simplex

---
 allscale/components/nmsimplex_bbincr.hpp |  5 +-
 src/components/localoptimizer.cpp        |  6 +-
 src/components/nmsimplex_bbincr.cpp      | 70 ++++++++++++++++++++----
 src/components/scheduler_component.cpp   | 32 +++++------
 4 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 157cb0b..441041e 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -66,7 +66,6 @@ struct optstepresult
 	int freq_idx;
 
 	/******VV: Cache stuff******/
-	double score;
 	double objectives[3]; // (time, energy, resource)
 	// VV: _cache_expires denotes dt (in ms) after _cache_timestamp
 	int64_t _cache_timestamp, _cache_expires_dt;
@@ -126,7 +125,7 @@ class NelderMead
 	}
 
 	unsigned long int getIterations() { return itr; }
-	double evaluate_score(const double objectives[], const double *weights) const;
+	double evaluate_score(const double objectives[], const double *weights);
 	void set_weights(const double weights[]);
 
 	optstepresult step(const double objectives[]);
@@ -134,6 +133,8 @@ class NelderMead
   private:
 	int warming_up_step;
 
+	double max_power_, max_time_;
+
 	// VV: Utility to make sure that we generate new values and not something that already
 	//     exists in the set of NMD_NUM_KNOBS+1 configuration points
 	template <typename F>
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 5dfd932..8dde2fa 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -38,7 +38,9 @@ void localoptimizer::setobjectives(double time_weight,
 
 	// VV: Modifying the objectives triggers restarting the optimizer
 	//     from scratch
-	initialize_nmd(true);
+	
+	mo_initialized = false;
+	converged_ = false;
 }
 
 void localoptimizer::reset(int threads, int freq_idx)
@@ -252,6 +254,8 @@ actuation localoptimizer::step()
 			}
 			
 			act.threads *= threads_dt;
+
+			threads_param_ = act.threads;
 #ifdef DEBUG_MULTIOBJECTIVE_
 			std::cout << "[LOCALOPTIMIZER|DEBUG] ACTUAL Vertex to try:";
 			std::cout << " Threads = " << act.threads;
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 94cea8a..e09c685 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -37,6 +37,9 @@ NelderMead::NelderMead(double eps)
     itr = 0;
     state_ = warmup;
     
+    max_power_ = 900.;
+    max_time_ = 3.2;
+
     warming_up_step = 0;
     convergence_reevaluating = false;
 }
@@ -244,11 +247,17 @@ bool NelderMead::cache_update(int threads, int freq_idx,
     return false;
 }
 
-double NelderMead::evaluate_score(const double objectives[], const double *weights) const
+double NelderMead::evaluate_score(const double objectives[], const double *weights)
 {
     double score;
     // VV: [time, energy/power, resources]
-    double scale[] = {1.0, 1100., 1.0};
+    double scale[] = {1.0, 1.0, 1.0};
+    
+    max_time_ = max_time_ > objectives[0] ? max_time_ : objectives[0];
+    max_power_ = max_power_ > objectives[1] ? max_power_ : objectives[1]; // VV: objectives = [time, energy/power, resources]
+
+    scale[0] = max_time_;
+    scale[1] = max_power_;
     scale[2] = (double)constraint_max[0];
 
     if (weights == nullptr)
@@ -327,6 +336,7 @@ void NelderMead::initialize_simplex(const double weights[3],
     cache_.clear();
     if (initial_simplex == nullptr)
     {
+        #if 0
         int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
                     + constraint_min[1]);
         int threads_med = round(0.5 * (constraint_max[0] - constraint_min[1])
@@ -336,11 +346,38 @@ void NelderMead::initialize_simplex(const double weights[3],
         initial_configurations[0][0] = threads_low;
         initial_configurations[0][1] = (int)constraint_min[1];
 
-        initial_configurations[1][0] = threads_med;
-        initial_configurations[1][1] = (int)constraint_max[1];
+        initial_configurations[1][0] = threads_high;
+        initial_configurations[1][1] = (int)constraint_min[1];
 
         initial_configurations[2][0] = threads_high;
         initial_configurations[2][1] = (int)constraint_max[1];
+        #else
+        for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
+            int is_ok = 1;
+            do {
+                
+                for (j=0; j<NMD_NUM_KNOBS; ++j)
+                    initial_configurations[i][j] = constraint_min[j] + rand() % (int) (constraint_max[j] - constraint_min[j]+1);
+                
+                is_ok = 1;
+
+                for (auto c=0; c<i && is_ok == 1; ++c)
+                {
+                    is_ok = 0;
+                    for ( j=0; j<NMD_NUM_KNOBS; ++j )
+                        is_ok |= (initial_configurations[c][j] != initial_configurations[i][j]);
+                }
+
+            } while (is_ok == 0);
+
+            OUT_DEBUG(
+                std::cout << "[NelderMead|DEBUG] Random initial simplex [" << i << "]: ";
+                for ( j =0; j<NMD_NUM_KNOBS; ++j) 
+                    std::cout << initial_configurations[i][j] << " ";
+                std::cout << std::endl;
+            )
+        }
+        #endif
     } else {
         double knob_set[NMD_NUM_KNOBS];
         for (i=0; i<NMD_NUM_KNOBS+1; ++i ) {
@@ -479,7 +516,7 @@ optstepresult NelderMead::do_step_start()
     auto gen_new = [this](double *extra) mutable -> double* {
         
         for (j = 0; j < NMD_NUM_KNOBS; j++)
-            vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]) + extra[j];
+            vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]) - extra[j];
        
         my_constraints(vr);
 
@@ -567,7 +604,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
         // VV: REFLECTED is better than BEST
         auto gen_new = [this](double *extra) mutable -> double* {
             for (j = 0; j < NMD_NUM_KNOBS; j++)
-                ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]) + extra[j];
+                ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]) - extra[j];
                 
             my_constraints(ve);
 
@@ -604,7 +641,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
         // VV: REFLECTED between SECOND BEST and WORST
         auto gen_new = [this](double *extra) mutable -> double* {
             for (j = 0; j < NMD_NUM_KNOBS; j++)
-                vc[j] = vm[j] + BETA * (vr[j] - vm[j]) + extra[j];
+                vc[j] = vm[j] + BETA * (vr[j] - vm[j]) - extra[j];
                 
             my_constraints(vc);
 
@@ -641,7 +678,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
         // VV: REFLECTED worse than WORST
         auto gen_new = [this](double *extra) mutable -> double* {
             for (j = 0; j < NMD_NUM_KNOBS; j++)
-                vc[j] = vm[j] - BETA * (vr[j] - vm[j]) + extra[j];
+                vc[j] = vm[j] - BETA * (vr[j] - vm[j]) - extra[j];
                 
             my_constraints(vc);
 
@@ -772,7 +809,7 @@ optstepresult NelderMead::do_step_contract(const double objectives[])
         
         auto gen_new = [this, &new_vh](double *extra) mutable -> double* {
             for (auto j = 0; j < NMD_NUM_KNOBS; j++)
-                new_vh[j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]) + extra[j];
+                new_vh[j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]) - extra[j];
                 
             my_constraints(new_vh);
 
@@ -851,6 +888,17 @@ optstepresult NelderMead::step(const double objectives[])
     
     std::size_t tested_combinations = cache_.size();
 
+    evaluate_score(objectives, nullptr);
+
+    for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
+        auto key = std::make_pair((int)v[i][0], (int)v[i][1]);
+        auto entry = cache_.find(key);
+
+        if ( entry != cache_.end() ) {
+            f[i] = evaluate_score(entry->second.objectives, nullptr);
+        }
+    }
+
     switch (state_)
     {
     case warmup:
@@ -865,7 +913,7 @@ optstepresult NelderMead::step(const double objectives[])
         if ( warming_up_step > 0 && warming_up_step < NMD_NUM_KNOBS + 1) {
             if ( (int) v[warming_up_step-1][0] != profiled_threads ) {
                 std::cout << "[NelderMead|WARN] Meant to profile " 
-                        << v[warming_up_step-1] << " threads "
+                        << v[warming_up_step-1][0] << " threads "
                         "but ended up using " << profiled_threads << std::endl;
                 v[warming_up_step-1][0] = profiled_threads;
             }
@@ -888,7 +936,7 @@ optstepresult NelderMead::step(const double objectives[])
         res.objectives[1] = -1;
         res.objectives[2] = -1;
         res.converged = false;
-        res.score = -1;
+
         res.threads = initial_configurations[warming_up_step][0];
         res.freq_idx = initial_configurations[warming_up_step][1];
         
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 6875533..8bf9946 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -569,7 +569,7 @@ void scheduler::initialize_cpu_frequencies() {
 
   // Make sure frequency change happened before continuing
   std::cout << "topo.num_logical_cores: " << topo.num_logical_cores
-            << "topo.num_hw_threads" << topo.num_hw_threads << "\n"
+            << " topo.num_hw_threads" << topo.num_hw_threads << "\n"
             << std::flush;
   {
     // check status of Pus frequency
@@ -647,7 +647,6 @@ void scheduler::optimize_locally(work_item const& work)
         // find out which pool has the most threads
 
         /* Count Active threads for validation*/
-
         hpx::threads::mask_type active_mask;
         std::size_t domain_active_threads = 0;
         std::size_t pool_idx = 0;
@@ -665,15 +664,16 @@ void scheduler::optimize_locally(work_item const& work)
             }
         }
         std::cout << "Active OS Threads = " <<  total_threads_counted << std::endl;
+
 #endif
 
 #ifdef MEASURE_
-        std::size_t temp_id = work.id().id;
-        if ((temp_id >= period_for_power) && (temp_id % period_for_power == 0))
-        {
-          auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
-          auto dt = timestamp_now - last_measure_power;
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        auto dt = timestamp_now - last_measure_power;
 
+        if (dt >= 5000)
+        {
+          
           dt = dt > 0 ? dt : 1 ;
 
           last_measure_power = timestamp_now;
@@ -736,25 +736,19 @@ void scheduler::optimize_locally(work_item const& work)
                 // amend threads if signaled
                 
                 if (act_temp.threads < active_threads){
-#ifdef DEBUG_MULTIOBJECTIVE_
-                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
-                    << " , target threads = " << act_temp.threads << std::endl;
-
-#endif    
-                    //unsigned int suspended_temp = suspend_threads(new_threads_target);
-                    //lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp);
                     suspend_threads(active_threads-act_temp.threads);
                 }
                 else if (act_temp.threads > active_threads){
-#ifdef DEBUG_MULTIOBJECTIVE_
-                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
-                    << " , target threads = " << act_temp.threads << std::endl;
-#endif
                     resume_threads(act_temp.threads - active_threads);
                 }
                 fix_allcores_frequencies(act_temp.frequency_idx);
                 lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx);
-                lopt_.setCurrentThreads(act_temp.threads);
+                lopt_.setCurrentThreads(active_threads);
+
+#ifdef DEBUG_MULTIOBJECTIVE_
+                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
+                    << " , target threads = " << act_temp.threads << ", set threads to " << active_threads << std::endl;
+#endif
             }
         } // uselopt
 #endif

From 9b30eac1c1a9a14a75abd2dda8516ae84e04779c Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Sun, 18 Nov 2018 15:47:27 +0000
Subject: [PATCH 14/37] Better logistics and modified objective score slightly

---
 allscale/components/scheduler.hpp      |  2 +-
 src/components/nmsimplex_bbincr.cpp    | 10 ++---
 src/components/scheduler_component.cpp | 61 +++++++++++++-------------
 3 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 9437eae..5ff8848 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -101,7 +101,7 @@ namespace allscale { namespace components {
         void update_active_osthreads(std::size_t threads, int64_t delta_time);
         void update_power_consumption(std::size_t power_sample, int64_t delta_time);
 #endif
-        int64_t last_measure_power;
+        int64_t last_measure_power, last_measure_threads;
 
         void fix_allcores_frequencies(int index);
 
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index e09c685..2b6820b 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -37,8 +37,8 @@ NelderMead::NelderMead(double eps)
     itr = 0;
     state_ = warmup;
     
-    max_power_ = 900.;
-    max_time_ = 3.2;
+    max_power_ = 1.0;
+    max_time_ = 30.0;
 
     warming_up_step = 0;
     convergence_reevaluating = false;
@@ -253,8 +253,8 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
     // VV: [time, energy/power, resources]
     double scale[] = {1.0, 1.0, 1.0};
     
-    max_time_ = max_time_ > objectives[0] ? max_time_ : objectives[0];
-    max_power_ = max_power_ > objectives[2] ? max_power_ : objectives[2];
+    // max_time_ = max_time_ > objectives[0] ? max_time_ : objectives[0];
+    // max_power_ = max_power_ > objectives[2] ? max_power_ : objectives[2];
 
     scale[0] = max_time_;
     scale[1] = max_power_;
@@ -910,7 +910,7 @@ optstepresult NelderMead::step(const double objectives[])
         // VV: Make sure that we actually profiled what we meant to
         int profiled_threads = objectives[2];
 
-        if ( warming_up_step > 0 && warming_up_step < NMD_NUM_KNOBS + 1) {
+        if ( warming_up_step > 0 && warming_up_step <= NMD_NUM_KNOBS + 1) {
             if ( (int) v[warming_up_step-1][0] != profiled_threads ) {
                 std::cout << "[NelderMead|WARN] Meant to profile " 
                         << v[warming_up_step-1][0] << " threads "
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 8bf9946..8568988 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -202,6 +202,7 @@ void scheduler::init() {
 
 #ifdef MEASURE_
   last_measure_power = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+  last_measure_threads = last_measure_power;
 // update_active_osthreads(0);
 // #ifdef ALLSCALE_HAVE_CPUFREQ
 //   update_power_consumption(hardware_reconf::read_system_power(), 1);
@@ -667,29 +668,6 @@ void scheduler::optimize_locally(work_item const& work)
 
 #endif
 
-#ifdef MEASURE_
-        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
-        auto dt = timestamp_now - last_measure_power;
-
-        if (dt >= 5000)
-        {
-          
-          dt = dt > 0 ? dt : 1 ;
-
-          last_measure_power = timestamp_now;
-          
-          update_active_osthreads(active_threads, dt);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-          allscale::components::monitor *monitor_c = &allscale::monitor::get();
-          auto measurement = monitor_c->get_current_power();
-          if ( measurement <= 10000 ) {
-            update_power_consumption(measurement, dt);
-          }
-#endif
-        }
-
-#endif
-
 #ifdef ALLSCALE_HAVE_CPUFREQ
         if (uselopt && !lopt_.isConverged()) {
             last_power_usage++;
@@ -704,6 +682,9 @@ void scheduler::optimize_locally(work_item const& work)
 
             long elapsedTimeMs = t_duration_now - last_objective_measurement_timestamp_;
 
+            auto dt_power = t_duration_now - last_measure_power;
+            update_power_consumption(power_sum/last_power_usage, dt_power);
+
             if (elapsedTimeMs > objective_measurement_period_ms){
                 last_objective_measurement_timestamp_= t_duration_now;
 
@@ -718,7 +699,7 @@ void scheduler::optimize_locally(work_item const& work)
                     current_avg_iter_time = 0.0;
                 }
 
-                lopt_.measureObjective(current_avg_iter_time,power_sum/last_power_usage,
+                lopt_.measureObjective(current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()),
                         active_threads);
                 last_power_usage=0;
                 power_sum=0;
@@ -733,8 +714,9 @@ void scheduler::optimize_locally(work_item const& work)
 #ifdef DEBUG_MULTIOBJECTIVE_
                 lopt_.printverbosesteps(act_temp);
 #endif
-                // amend threads if signaled
-                
+                auto dt_threads = t_duration_now - last_measure_threads;
+                update_active_osthreads(active_threads, dt_threads);
+                last_measure_threads = t_duration_now;
                 if (act_temp.threads < active_threads){
                     suspend_threads(active_threads-act_temp.threads);
                 }
@@ -750,7 +732,20 @@ void scheduler::optimize_locally(work_item const& work)
                     << " , target threads = " << act_temp.threads << ", set threads to " << active_threads << std::endl;
 #endif
             }
-        } // uselopt
+        } 
+    #ifdef MEASURE_
+        else {
+          auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+          auto dt = timestamp_now - last_measure_power;
+          if ( dt >= 1000 ) {
+            allscale::components::monitor *monitor_c = &allscale::monitor::get();
+            auto cur_power = monitor_c->get_current_power();
+
+            update_power_consumption(cur_power, dt);
+            last_measure_power = timestamp_now;
+          }
+        }
+    #endif
 #endif
     }
 }
@@ -1377,6 +1372,9 @@ void scheduler::update_active_osthreads(std::size_t threads, int64_t delta_time)
 
 void scheduler::update_power_consumption(std::size_t power_sample, int64_t delta_time)
 {
+  if ( power_sample > 10000)
+    return;
+  
   if (meas_power_max==0 || meas_power_max < power_sample)
     meas_power_max=power_sample;
 
@@ -1457,16 +1455,19 @@ void scheduler::stop() {
 #ifdef DEBUG_MULTIOBJECTIVE_
 #ifdef MEASURE_
   auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
-  auto dt = timestamp_now - last_measure_power;
+  auto dt_threads = timestamp_now - last_measure_threads;
+  auto dt_power = timestamp_now - last_measure_power;
+
   last_measure_power = timestamp_now;
+  last_measure_threads = timestamp_now;
 
-  update_active_osthreads(active_threads, dt);
+  update_active_osthreads(active_threads, dt_threads);
 #ifdef ALLSCALE_HAVE_CPUFREQ
   allscale::components::monitor *monitor_c = &allscale::monitor::get();
 
   auto measurement = monitor_c->get_current_power();
   if ( measurement <= 10000 ) {
-    update_power_consumption(measurement, dt);
+    update_power_consumption(measurement, dt_power);
   }
 #endif
 

From 0d80ac7fb3c3a416ef1302b39ce5ec361dbf2cd9 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Sun, 18 Nov 2018 16:28:30 +0000
Subject: [PATCH 15/37] Reporting current system score plus keeping track of
 thread logistics

---
 allscale/components/localoptimizer.hpp   |  2 +-
 allscale/components/nmsimplex_bbincr.hpp |  2 +-
 allscale/components/scheduler.hpp        |  5 +++++
 allscale/dashboard.hpp                   |  2 +-
 src/components/localoptimizer.cpp        |  9 ++++++++
 src/components/scheduler_component.cpp   | 28 +++++++-----------------
 src/dashboard.cpp                        |  7 +++++-
 7 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index d708d1d..bb6b325 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -76,7 +76,7 @@ struct localoptimizer
 			srand(std::time(NULL));
 	}
 	bool isConverged();
-
+	double evaluate_score(const double objectives[]);
 	void setPolicy(searchPolicy pol)
 	{
 		optmethod_ = pol;
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 441041e..66eed17 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -64,7 +64,7 @@ struct optstepresult
 	double threads;
 	/* index to frequency vector for freq parameter to set for sampling*/
 	int freq_idx;
-
+	
 	/******VV: Cache stuff******/
 	double objectives[3]; // (time, energy, resource)
 	// VV: _cache_expires denotes dt (in ms) after _cache_timestamp
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 5ff8848..706b5a9 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -72,6 +72,10 @@ namespace allscale { namespace components {
         void get_local_optimizer_weights(double *time_weight, 
                                          double *energy_weight,
                                          double *resource_weight);
+        
+        double get_last_objective_score() {
+                return last_objective_score;
+        }
     private:
 
         std::size_t get_num_numa_nodes();
@@ -101,6 +105,7 @@ namespace allscale { namespace components {
         void update_active_osthreads(std::size_t threads, int64_t delta_time);
         void update_power_consumption(std::size_t power_sample, int64_t delta_time);
 #endif
+        double last_objective_score;
         int64_t last_measure_power, last_measure_threads;
 
         void fix_allcores_frequencies(int index);
diff --git a/allscale/dashboard.hpp b/allscale/dashboard.hpp
index 73670a2..385f4f1 100644
--- a/allscale/dashboard.hpp
+++ b/allscale/dashboard.hpp
@@ -89,7 +89,7 @@ namespace allscale { namespace dashboard
 
         // current power usage / max power usage \in [0..1]
         float power = 0;
-
+        
         std::string to_json() const;
 
         template <typename Archive>
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 8dde2fa..f0a36d4 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -24,6 +24,15 @@ namespace allscale
 {
 namespace components
 {
+
+double localoptimizer::evaluate_score(const double objectives[])
+{
+	if ( mo_initialized ) {
+		return nmd.evaluate_score(objectives, nullptr);
+	}
+
+	return -1.0;
+}
 void localoptimizer::setobjectives(double time_weight, 
 								   double energy_weight, 
 								   double resource_weight)
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 8568988..5d4d9fb 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -86,6 +86,7 @@ scheduler::scheduler(std::uint64_t rank)
 #endif
       ,
       nr_opt_steps(0),
+      last_objective_score(-1.0),
       uselopt(false)
   {
   allscale_monitor = &allscale::monitor::get();
@@ -203,10 +204,6 @@ void scheduler::init() {
 #ifdef MEASURE_
   last_measure_power = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
   last_measure_threads = last_measure_power;
-// update_active_osthreads(0);
-// #ifdef ALLSCALE_HAVE_CPUFREQ
-//   update_power_consumption(hardware_reconf::read_system_power(), 1);
-// #endif
 #endif
 
   rp_ = &hpx::resource::get_partitioner();
@@ -683,6 +680,7 @@ void scheduler::optimize_locally(work_item const& work)
             long elapsedTimeMs = t_duration_now - last_objective_measurement_timestamp_;
 
             auto dt_power = t_duration_now - last_measure_power;
+            last_measure_power = t_duration_now;
             update_power_consumption(power_sum/last_power_usage, dt_power);
 
             if (elapsedTimeMs > objective_measurement_period_ms){
@@ -698,11 +696,14 @@ void scheduler::optimize_locally(work_item const& work)
 #endif
                     current_avg_iter_time = 0.0;
                 }
-
+                double last_objectives[] = {current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()),
+                        active_threads};
                 lopt_.measureObjective(current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()),
                         active_threads);
                 last_power_usage=0;
                 power_sum=0;
+
+                last_objective_score = lopt_.evaluate_score(last_objectives);
             }
 
             elapsedTimeMs = t_duration_now - last_optimization_timestamp_;
@@ -729,7 +730,7 @@ void scheduler::optimize_locally(work_item const& work)
 
 #ifdef DEBUG_MULTIOBJECTIVE_
                     std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
-                    << " , target threads = " << act_temp.threads << ", set threads to " << active_threads << std::endl;
+                    << " , target threads = " << act_temp.threads << std::endl;
 #endif
             }
         } 
@@ -992,10 +993,6 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) {
   std::cout << "total active PUs: " << active_threads_ << "\n";
 #endif
 
-// #ifdef MEASURE_
-//   update_active_osthreads(active_threads_-active_threads);
-// #endif
-
   active_threads = active_threads_;
 
   growing = false;
@@ -1057,9 +1054,6 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) {
             )
         );
   }
-// #ifdef MEASURE_
-//   update_active_osthreads(-1 * suspend_threads.size());
-// #endif
 
   active_threads = active_threads - suspend_threads.size();
 
@@ -1178,10 +1172,6 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) {
   std::cout << "total active PUs: " << active_threads_ << "\n";
 #endif
 
-// #ifdef MEASURE_
-//   update_active_osthreads(active_threads_-active_threads);
-// #endif
-
   active_threads = active_threads_;
   // if no thread is suspended, nothing to do
   if (domain_blocked_threads == 0) {
@@ -1237,9 +1227,6 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) {
             )
         );
   }
-// #ifdef MEASURE_
-//   update_active_osthreads(resume_threads.size());
-// #endif
   active_threads = active_threads + resume_threads.size();
 #ifdef DEBUG_THREADSTATUS_
   std::cout << "[SCHEDULER|INFO]: Thread Resume - Newly Active Threads: " << active_threads
@@ -1368,6 +1355,7 @@ void scheduler::update_active_osthreads(std::size_t threads, int64_t delta_time)
 
   meas_active_threads_count += delta_time;
   meas_active_threads_sum += active_threads * delta_time;
+  std::cout <<"REGISTERING THREADS " << threads << " for " << delta_time << std::endl;
 }
 
 void scheduler::update_power_consumption(std::size_t power_sample, int64_t delta_time)
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index 99bd6fb..fcf7e8b 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -70,7 +70,6 @@ namespace allscale { namespace dashboard
         state.max_power = monitor_c->get_max_power();
         state.power = state.cur_power / state.max_power;
 #endif
-
         return state;
     }
 }}
@@ -169,9 +168,15 @@ namespace allscale { namespace dashboard
 
     float system_state::score() const
     {
+#ifdef ALLSCALE_HAVE_CPUFREQ
+        return std::exp(speed * speed_exponent) *
+                std::exp(efficiency * efficiency_exponent ) *
+                std::exp(power * power_exponent);
+#else
         return std::pow(speed, speed_exponent) *
                std::pow(efficiency, efficiency_exponent) *
                std::pow(1 - power, power_exponent);
+#endif
     }
 
     template void node_state::serialize<hpx::serialization::input_archive>(hpx::serialization::input_archive& ar, unsigned);

From 5a5b30fdd6c5238e2e3c979a7ff9b3b3ad37150d Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 19 Nov 2018 13:47:47 +0000
Subject: [PATCH 16/37] Initial prototype of new ino_nmd

---
 allscale/components/localoptimizer.hpp   |   4 +-
 allscale/components/nmsimplex_bbincr.hpp |   1 +
 allscale/components/scheduler.hpp        |   4 +
 allscale/optimizer.hpp                   |  20 +-
 src/components/nmsimplex_bbincr.cpp      |  79 ++++++
 src/components/scheduler_component.cpp   |   8 +-
 src/optimizer.cpp                        | 301 ++++++++++++++++++++++-
 src/scheduler.cpp                        |  20 +-
 8 files changed, 417 insertions(+), 20 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index bb6b325..722520c 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -66,11 +66,11 @@ struct localoptimizer
 		  frequency_param_(0),
 #endif
 		  converged_(false),
-		  convergence_threshold_(0.01),
+		  convergence_threshold_(0.005),
 		  time_weight(0.0),
 		  energy_weight(0.0),
 		  resource_weight(0.0),
-		  nmd(0.01)
+		  nmd(0.005)
 	{
 		if (optmethod_ == random)
 			srand(std::time(NULL));
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 66eed17..8ed77dc 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -90,6 +90,7 @@ class NelderMead
 {
 
   public:
+	NelderMead(const NelderMead &other);
 	NelderMead(double);
 	// VV: For the time being 
 	//     weights = [ W_time, W_energy/power, W_resources ]
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 706b5a9..f728526 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -65,6 +65,10 @@ namespace allscale { namespace components {
         {
             return active_threads;
         }
+
+        std::size_t get_total_threads() const {
+                return os_thread_count;
+        }
         
         void set_local_optimizer_weights(double time_weight, 
                                          double energy_weight,
diff --git a/allscale/optimizer.hpp b/allscale/optimizer.hpp
index fc64428..7f019fa 100644
--- a/allscale/optimizer.hpp
+++ b/allscale/optimizer.hpp
@@ -11,6 +11,8 @@
 #include <hpx/lcos/future.hpp>
 #include <hpx/traits/is_bitwise_serializable.hpp>
 
+#include <allscale/components/nmsimplex_bbincr.hpp>
+
 #include <iosfwd>
 #include <vector>
 
@@ -23,6 +25,7 @@ namespace allscale {
         float avg_time_;
         unsigned long long energy_;
         std::uint64_t active_frequency_;
+        std::size_t active_cores_per_node_;
         std::size_t cores_per_node_;
 
         template <typename Archive>
@@ -33,6 +36,7 @@ namespace allscale {
             ar & avg_time_;
             ar & energy_;
             ar & active_frequency_;
+            ar & active_cores_per_node_;
             ar & cores_per_node_;
         }
     };
@@ -87,6 +91,14 @@ namespace allscale {
           , f_resource_max(other.f_resource_max)
           , f_resource_leeway(other.f_resource_leeway)
           , o_ino(std::move(o_ino))
+          // VV: Used by balance_ino_nmd
+          , nmd_initialized(other.nmd_initialized)
+          , nmd(other.nmd)
+          , nodes_min(other.nodes_min)
+          , nodes_max(other.nodes_max)
+          , threads_min(other.threads_min)
+          , threads_max(other.threads_max)
+          , previous_num_nodes(other.previous_num_nodes)
         {}
 
         bool active() const
@@ -96,6 +108,7 @@ namespace allscale {
 
         hpx::future<void> balance(bool);
         hpx::future<void> balance_ino(const std::vector<std::size_t> &old_mapping);
+        hpx::future<void> balance_ino_nmd(const std::vector<std::size_t> &old_mapping);
         hpx::future<void> decide_random_mapping(const std::vector<std::size_t> &old_mapping);
 
         bool may_rebalance();
@@ -104,7 +117,7 @@ namespace allscale {
         std::size_t u_steps_till_rebalance;
 
         void tune(std::vector<optimizer_state> const& state);
-
+        int nmd_initialized;
         std::vector<bool> active_nodes_;
         std::uint64_t active_frequency_;
 
@@ -118,9 +131,14 @@ namespace allscale {
 
         std::vector<hpx::id_type> localities_;
 
+        // VV: balance_ino and balance_global data
         float f_resource_max, f_resource_leeway;
+        std::size_t previous_num_nodes;
+        int nodes_min, nodes_max, threads_min, threads_max;
 
         components::internode_optimizer_t o_ino;
+
+        components::NelderMead nmd;
     };
 }
 
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 2b6820b..9f42307 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -26,6 +26,45 @@ namespace allscale
 namespace components
 {
 
+
+NelderMead::NelderMead(const NelderMead &other)
+{
+    EPSILON = other.EPSILON;
+    state_ = other.state_;
+    max_power_ = other.max_power_;
+    max_time_ = other.max_time_;
+    
+    cache_.insert(other.cache_.begin(), other.cache_.end());
+    warming_up_step = other.warming_up_step;
+    convergence_reevaluating = other.convergence_reevaluating;
+
+    fc = other.fc;
+    fe = other.fe;
+    vs = other.vs;
+    vg = other.vg;
+    vh = other.vh;
+
+    for (auto i=0; i<NMD_NUM_KNOBS; ++i) {
+        constraint_max[i] = other.constraint_max[i];
+        constraint_min[i] = other.constraint_min[i];
+        vr[i] = other.vr[i];
+        ve[i] = other.ve[i];
+        vm[i] = other.vm[i];
+    }
+
+    for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++i )
+        opt_weights[i] = other.opt_weights[i];
+
+    for (auto i=0; i<NMD_NUM_KNOBS+1; ++i )
+    {
+        for ( auto j=0; j<NMD_NUM_KNOBS; ++j ) {
+            v[i][j] = other.v[i][j];
+            initial_configurations[i][j] = other.initial_configurations[i][j];
+        }
+    }
+}
+
+
 //NelderMead::NelderMead(double (*objfunc)(double[]),double eps){
 NelderMead::NelderMead(double eps)
 {
@@ -311,6 +350,32 @@ void NelderMead::initialize_simplex(const double weights[3],
     warming_up_step = 0;
     convergence_reevaluating = false;
     cache_.clear();
+
+    for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
+        int is_ok = 1;
+        do {
+            
+            for (j=0; j<NMD_NUM_KNOBS; ++j)
+                initial_configurations[i][j] = constraint_min[j] + rand() % (int) (constraint_max[j] - constraint_min[j]+1);
+            
+            is_ok = 1;
+
+            for (auto c=0; c<i && is_ok == 1; ++c)
+            {
+                is_ok = 0;
+                for ( j=0; j<NMD_NUM_KNOBS; ++j )
+                    is_ok |= (initial_configurations[c][j] != initial_configurations[i][j]);
+            }
+
+        } while (is_ok == 0);
+
+        OUT_DEBUG(
+            std::cout << "[NelderMead|DEBUG] Random initial simplex [" << i << "]: ";
+            for ( j =0; j<NMD_NUM_KNOBS; ++j) 
+                std::cout << initial_configurations[i][j] << " ";
+            std::cout << std::endl;
+        )
+    }
 }
 
 /* FIXME: generalize */
@@ -907,6 +972,20 @@ optstepresult NelderMead::step(const double objectives[])
             std::cout << "[NelderMead|DEBUG] State = Warmup " 
                       << warming_up_step << std::endl;
         #endif
+
+        OUT_DEBUG(
+            if ( warming_up_step == 0 ) {
+                std::cout << "[NelderMead|DEBUG] Initial exploration" << std::endl;
+
+                for ( auto i =0; i<NMD_NUM_KNOBS+1; ++i ) {
+                    std::cout << "Simplex[" << i <<"]:";
+                    for ( auto j=0; j<NMD_NUM_KNOBS; ++j )
+                        std::cout << " " << initial_configurations[i][j];
+                    std::cout << std::endl;
+                }
+            }
+        )
+
         // VV: Make sure that we actually profiled what we meant to
         int profiled_threads = objectives[2];
 
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 5d4d9fb..0ef7413 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -423,10 +423,12 @@ void scheduler::init() {
   }
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 else {
+    /*
     using hardware_reconf = allscale::components::util::hardware_reconf;
     auto  freqs = hardware_reconf::get_frequencies(0);
     // VV: Set maximum frequency
     fix_allcores_frequencies(freqs[freqs.size()-1]);
+    */
 }
 #endif
 }
@@ -1354,8 +1356,10 @@ void scheduler::update_active_osthreads(std::size_t threads, int64_t delta_time)
     meas_active_threads_min=threads;
 
   meas_active_threads_count += delta_time;
-  meas_active_threads_sum += active_threads * delta_time;
-  std::cout <<"REGISTERING THREADS " << threads << " for " << delta_time << std::endl;
+  meas_active_threads_sum += threads * delta_time;
+
+  std::cout <<"REGISTERING THREADS " << threads << " for " << delta_time << 
+  " current average " << (meas_active_threads_sum/meas_active_threads_count) << std::endl;
 }
 
 void scheduler::update_power_consumption(std::size_t power_sample, int64_t delta_time)
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index e919fb6..4fedd61 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -17,6 +17,7 @@
 #include <cmath>
 #include <iostream>
 #include <iomanip>
+#include <map>
 
 #include <sys/types.h>
 #include <unistd.h>
@@ -38,7 +39,7 @@ namespace allscale
         allscale::components::monitor *monitor_c = &allscale::monitor::get();
         float power_now = 100.f;
 #if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
-        power_now = monitor_c->get_current_power();
+        power_now = monitor_c->get_current_power() / monitor_c->get_max_power();
 #endif
         // VV: Use power as if it were energy
         return {
@@ -47,7 +48,8 @@ namespace allscale
             my_time,
             power_now,
             float(monitor_c->get_current_freq(0)),
-            scheduler::get().get_active_threads()
+            scheduler::get().get_active_threads(),
+            scheduler::get().get_total_threads()
         };
     }
 // optimizer_state get_optimizer_state()
@@ -185,11 +187,15 @@ global_optimizer::global_optimizer()
     active_nodes_(allscale::get_num_localities(), true), tuner_(new simple_coordinate_descent(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)})),
     objective_(get_default_objective()),
     active_(true), localities_(hpx::find_all_localities()),
-    f_resource_max(-1.0f), f_resource_leeway(-1.0f)
+    f_resource_max(-1.0f), f_resource_leeway(-1.0f), 
+    nmd(0.005),
+    nmd_initialized(0),
+    nodes_min(1), nodes_max(localities_.size()), threads_min(0), threads_max(0)
 {
     char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
+    previous_num_nodes = localities_.size();
 
-    if (c_policy && strncasecmp(c_policy, "ino", 3) == 0 )
+    if (c_policy && strcasecmp(c_policy, "ino") == 0 )
     {
         char *const c_resource_max = std::getenv("ALLSCALE_RESOURCE_MAX");
         char *const c_resource_leeway = std::getenv("ALLSCALE_RESOURCE_LEEWAY");
@@ -207,20 +213,30 @@ global_optimizer::global_optimizer()
             f_resource_max = 0.75f;
         else
             f_resource_max = atof(c_resource_max);
+        
+        nodes_min = f_resource_leeway * localities_.size();
+        nodes_max = localities_.size();
+
+        if ( nodes_min < 1 )
+            nodes_min = 1;
+    }
 
+    if ( c_policy && strcasecmp(c_policy, "ino"))
         o_ino = allscale::components::internode_optimizer_t(localities_.size(),
                                                             (double) f_resource_max,
                                                             (double) f_resource_leeway,
                                                             INO_DEFAULT_FORGET_AFTER);
+    
+    if ( c_policy && strcasecmp(c_policy, "ino_nmd")) {       
+        char *const c_threads_min = std::getenv("ALLSCALE_GINO_THREADS_MIN");
+        char *const c_threads_max = std::getenv("ALLSCALE_GINO_THREADS_MAX");
+        
+        if ( c_threads_min )
+            threads_min = atoi(c_threads_min);
+        
+        if ( c_threads_max )
+            threads_max = atoi(c_threads_max);
     }
-//     else if ( strncasecmp(c_policy, "truly_random", 12) == 0 ) {
-//         char *const c_balance_every = std::getenv("ALLSCALE_TRULY_RANDOM_BALANCE_EVERY");
-//
-//         if ( c_balance_every ) {
-//             u_balance_every = (std::size_t) atoi(c_balance_every);
-//             u_steps_till_rebalance = u_balance_every;
-//         }
-//     }
 }
 
 void global_optimizer::tune(std::vector<optimizer_state> const &state)
@@ -419,6 +435,267 @@ hpx::future<void> global_optimizer::decide_random_mapping(const std::vector<std:
         );
 }
 
+hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_t> &old_mapping)
+{
+    u_steps_till_rebalance = u_balance_every;
+    return hpx::lcos::broadcast<allscale_get_optimizer_state_action>(localities_)
+        .then(
+            [this, old_mapping](hpx::future<std::vector<optimizer_state> > future_state) {
+                std::lock_guard<mutex_type> l(mtx_);
+                
+                auto state = future_state.get();
+                float avg_time = 0;
+                float avg_energy = 0;
+                float avg_threads = 0;
+                int from_node = 0;
+
+                std::size_t num_avg_time = 0ul;
+
+                for (const auto &s:state) {
+                    if ( s.avg_time_ > 0.0) {
+                        avg_time += s.avg_time_;
+                        num_avg_time ++;
+                    }
+                    avg_energy += s.energy_;
+                    avg_threads += s.active_cores_per_node_ / (float) s.cores_per_node_;
+                    std::cout << "From " << from_node 
+                        << " t:" << s.avg_time_
+                        << " e:" << s.energy_
+                        << " h:" << s.active_cores_per_node_ / (float) s.cores_per_node_
+                        << " (" << s.active_cores_per_node_ << ", " 
+                        <<s.cores_per_node_ << std::endl;
+                    ++from_node;
+                }
+                
+                if ( num_avg_time )
+                    avg_time /= num_avg_time;
+                else
+                    avg_time = 0.0;
+
+                avg_energy /= state.size();
+                avg_threads /= state.size();
+
+                // VV: First record current state
+                double measurements[3] = {avg_time, 
+                                        avg_energy, 
+                                        avg_threads * previous_num_nodes};
+                
+                if ( nmd_initialized == 0 ) {
+                    double weights[] = {(double) objective_.speed_exponent, 
+                                        (double) objective_.efficiency_exponent,
+                                        (double) objective_.power_exponent};
+                    const double constraint_min[] = {(double) nodes_min, 
+                                                      (double) threads_min};
+                    const double constraint_max[] = {(double) nodes_max, 
+                                                    (double) threads_max};
+                    for ( auto i=0; i<2; ++i ) {
+                        std::cout << "NMD Constraints[" << i << "]: "
+                                    << constraint_min[0] << " -> " 
+                                    << constraint_max[0] << " and "
+                                    << constraint_min[1] << " -> " 
+                                    << constraint_max[1] << std::endl;
+                    }
+                    nmd.initialize_simplex(weights, 
+                                            nullptr,
+                                            constraint_min,
+                                            constraint_max);
+                    
+                    nmd_initialized = 1;
+                }
+
+                auto action = nmd.step(measurements);
+                // VV: Todo do something with the action
+                //     assume that .threads = nodes and .freq_idx = threads per node
+                int new_num_nodes = action.threads;
+                int new_threads_per_node = action.freq_idx;
+
+                if ( new_num_nodes != previous_num_nodes ) {
+                    // VV: Need to redistribute tasks to nodes.
+                    //     Try to move as few as possible tasks
+                    /* VV: Balancing algorithm:
+                        new_avg_tasks = ceil(total_tasks / new_num_nodes)
+                        node_to_tasks{} = find out which tasks each node is computing()
+                        
+                        if ( new_num_nodes < previous_nodes ) {
+                            // VV: Evenly distribute all now orphaned tasks to remaining nodes
+                            orphaned_tasks = those which were running on the now unused nodes
+                            for ( node:new_used_nodes ) {
+                                old_tasks = size(node_to_tasks[node])
+                                added_to_node = 0;
+                                while (remaining_orphaned 
+                                        && added_to_node < new_avg_tasks-old_tasks) {
+                                    orphan = orphaned.pop()
+                                    node.tasks.push_back(orphan)
+                                    added_to_node ++;
+                                }
+                            }
+                        } else if ( new_num_nodes > previous_node ) {
+                            num_need_to_move = new_avg_tasks;
+                            node_to_move = previous_nodes;
+
+                            // VV: Redistribute last tasks from overflowed nodes to new ones
+                            while ( num_need_to_move > 0 && node_to_move < new_num_nodes ) {
+                                for ( node:new_used_nodes ) {
+                                    if ( num_need_to_move == 0 ) {
+                                        if ( node_to_move < new_num_nodes) {
+                                            node_to_move ++;
+                                            num_need_to_move = new_avg_tasks;
+                                        } else {
+                                            break;
+                                        }
+                                    }
+
+                                    task = node.tasks[-1]
+                                    node_to_tasks[node_to_move].tasks.push_back(task)
+                                    num_need_to_move --
+                                }
+                            }
+                        }
+                    */
+                    auto new_avg_tasks = (std::size_t) std::ceil(old_mapping.size()/
+                                                                 (float)new_num_nodes);
+                    auto new_mapping = std::vector<std::size_t>(old_mapping.size(), 0ul);
+                    auto node_to_tasks = std::map<std::size_t, std::vector<std::size_t> >();
+                    // VV: node_to_tasks maps node id to list of tasks that it's running
+                    std::size_t task_id = 0;
+                    std::size_t num_active_nodes = std::count(active_nodes_.begin(),                                active_nodes_.end(), true);
+
+                    for (auto i=0ul; i<num_active_nodes; ++i)
+                        node_to_tasks.insert(std::make_pair(i, std::vector<std::size_t>()));
+
+                    for ( const auto &node_id:old_mapping )
+                        node_to_tasks[node_id].push_back(task_id++);
+
+
+                    std::cout << "[GLOBAL OPTIMIZER] Rebalancing (original):" << std::endl;
+
+                    for ( const auto &node: node_to_tasks ) {
+                        std::cout << "node " << node.first << ": ";
+                        for ( const auto &task:node.second)
+                            std::cout << " " << task;
+                        std::cout << std::endl;
+                    }
+
+                    // VV: Something else is setting the scheduling policy too
+                    //     try to redistribute tasks to all @previous_num_nodes
+
+                    std::cout << "[GLOBAL OPTIMIZER] Re-balancing previous nodes" << std::endl;
+
+                    auto prev_avg_tasks =
+                    (std::size_t) std::ceil(old_mapping.size() /
+                                            (float)previous_num_nodes);
+                    auto node_fewer_tasks = 1ul;
+
+                    for (auto node_id = 0ul; node_id < previous_num_nodes; ++node_id)
+                    {
+                        auto &node = node_to_tasks[node_id];
+                        while (node.size() > prev_avg_tasks)
+                        {
+                            while (node_to_tasks[node_fewer_tasks].size() >= prev_avg_tasks)
+                                if (++node_fewer_tasks == previous_num_nodes)
+                                    break;
+
+                            if (node_fewer_tasks == previous_num_nodes)
+                                break;
+
+                            auto task = node.back();
+                            node.pop_back();
+                            node_to_tasks[node_fewer_tasks].push_back(task);
+                        }
+                    }
+
+                    std::cout << "[GLOBAL OPTIMIZER] Rebalanced (still original):" << std::endl;
+
+                    for ( const auto &node: node_to_tasks ) {
+                        std::cout << "node " << node.first << ": ";
+                        for ( const auto &task:node.second)
+                            std::cout << " " << task;
+                        std::cout << std::endl;
+                    }
+
+
+                    std::cout << "[GLOBAL OPTIMIZER] Changing nodes from "
+                              << previous_num_nodes
+                              << " to " << new_num_nodes << std::endl;
+
+                    if (new_num_nodes < previous_num_nodes)
+                    {
+                        std::cout << "[GLOBAL OPTIMIZER] Decreasing nodes" << std::endl;
+                        auto lost_node = new_num_nodes;
+
+                        while (lost_node < previous_num_nodes && node_to_tasks[lost_node].size())
+                        {
+                            for (auto node_id = 0ul; node_id < new_num_nodes; ++node_id)
+                            {
+                                auto &node = node_to_tasks[node_id];
+                                auto old_tasks = node.size();
+                                for (auto new_tasks = old_tasks;
+                                     lost_node < previous_num_nodes && new_tasks < new_avg_tasks;
+                                     new_tasks++)
+                                {
+                                    // VV: Move next orphaned task to @node
+                                    while (node_to_tasks[lost_node].size() == 0)
+                                    {
+                                        if (++lost_node == previous_num_nodes)
+                                            break;
+                                    }
+
+                                    if (lost_node == previous_num_nodes)
+                                        break;
+
+                                    std::size_t task = node_to_tasks[lost_node].back();
+                                    node_to_tasks[lost_node].pop_back();
+                                    node.push_back(task);
+                                }
+                            }
+                        }
+                    }
+                    else if (new_num_nodes > previous_num_nodes)
+                    {
+                        std::cout << "[GLOBAL OPTIMIZER] Increasing nodes" << std::endl;
+                        auto new_node = previous_num_nodes - 1;
+                        for (auto node_id = 0ul; node_id < previous_num_nodes; ++node_id)
+                        {
+                            auto &node = node_to_tasks[node_id];
+                            while (node.size() > new_avg_tasks)
+                            {
+                                while (node_to_tasks[new_node].size() >= new_avg_tasks)
+                                    if (++new_node == new_num_nodes)
+                                        break;
+
+                                if (new_node == new_num_nodes)
+                                    break;
+
+                                auto task = node.back();
+                                node.pop_back();
+                                node_to_tasks[new_node].push_back(task);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        std::cout << "[GLOBAL OPTIMIZER] Did not modify mapping" << std::endl;
+                    }
+
+                    if (previous_num_nodes != new_num_nodes ){
+                        {
+                            std::cout << "[GLOBAL OPTIMIZER] Rebalancing (NEW):" << std::endl;
+
+                            for ( const auto &node: node_to_tasks ) {
+                                std::cout << "node " << node.first << ": ";
+                                for ( const auto &task:node.second)
+                                    std::cout << " " << task;
+                                std::cout << std::endl;
+                            }
+
+                        }
+                        previous_num_nodes = new_num_nodes;
+                        hpx::lcos::broadcast_apply<allscale_optimizer_update_policy_action_ino>(localities_, new_mapping);
+                    }
+                }
+            });
+}
+
 hpx::future<void> global_optimizer::balance_ino(const std::vector<std::size_t> &old_mapping)
 {
     /*VV: Compute the new ino_knobs (i.e. number of Nodes), then assign tasks to
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index d88d568..6a63a0a 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -175,6 +175,7 @@ namespace allscale
                  * ALLSCALE_RESOURCE_LEEWAY = (0.0, 1.0) // extra percentage allowed to explore
                  */
                 ino,
+                ino_nmd,
                 random,
                 truly_random
             };
@@ -194,6 +195,8 @@ namespace allscale
                         return "tuned";
                     case ino:
                         return "ino";
+                    case ino_nmd:
+                        return "ino_nmd";
                     case random:
                         return "random";
                     case truly_random:
@@ -234,6 +237,12 @@ namespace allscale
                     tree_scheduling_policy::create_uniform(allscale::get_num_localities())
                 };
             }
+            if (policy == "ino_nmd" ) {
+                return {
+                    replacable_policy::ino_nmd,
+                    tree_scheduling_policy::create_uniform(allscale::get_num_localities())
+                };
+            }
             if (policy == "truly_random")
             {
                 return {
@@ -343,8 +352,8 @@ namespace allscale
         void apply_new_mapping(const std::vector<std::size_t> &new_mapping)
         {
             std::lock_guard<mutex_type> l(mtx_);
-            policy_.policy_ = tree_scheduling_policy::from_mapping(*policy_.policy_,
-                                                                    new_mapping);
+            policy_.policy_ = 
+                tree_scheduling_policy::from_mapping(*policy_.policy_, new_mapping);
         }
 
         void toggle_node(std::size_t locality_id)
@@ -493,6 +502,11 @@ namespace allscale
                 tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
                 optimizer_.balance_ino(old.task_distribution_mapping());
             }
+            
+            if ( policy_.value_ == replacable_policy::ino_nmd) {
+                tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
+                optimizer_.balance_ino_nmd(old.task_distribution_mapping());
+            }
 
             if (policy_.value_ == replacable_policy::truly_random) {
                 tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
@@ -512,7 +526,7 @@ namespace allscale
 
         void schedule(work_item work)
         {
-            if (is_root_ && work.id().is_root() && work.id().id % 20 == 0)
+            if (is_root_ && work.id().is_root() && work.id().id % 5 == 0)
             {
                 balance();
             }

From f1ceffcb9bd6ee90da54cf38b9890a0f076935d3 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 19 Nov 2018 14:08:10 +0000
Subject: [PATCH 17/37] Take into account that nodes might have died

Added a virtual to physical node dictionary.

INO_NMD assumes that nodes are in sequential order while it is
deciding which nodes to use, but once it has made its choices it
makes sure that it uses only nodes which are working.
---
 src/components/scheduler_component.cpp |  6 +++++-
 src/optimizer.cpp                      | 19 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 0ef7413..ce516b1 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -1462,7 +1462,11 @@ void scheduler::stop() {
     update_power_consumption(measurement, dt_power);
   }
 #endif
-
+  if ( meas_active_threads_count == 0 )
+    meas_active_threads_count = 1;
+  if ( meas_power_count == 0 )
+    meas_power_count = 1;
+  
   std::cout << "\n****************************************************\n" << std::flush;
   std::cout << "Measured Metrics of Application Execution:\n"
 
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 4fedd61..2fcc698 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -586,7 +586,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                                             (float)previous_num_nodes);
                     auto node_fewer_tasks = 1ul;
 
-                    for (auto node_id = 0ul; node_id < previous_num_nodes; ++node_id)
+                    for (auto node_id = 0ul; node_id < num_active_nodes; ++node_id)
                     {
                         auto &node = node_to_tasks[node_id];
                         while (node.size() > prev_avg_tasks)
@@ -689,6 +689,23 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                             }
 
                         }
+                        // VV: Some of the nodes might be dead, convert the virtual name
+                        //     to the physical name
+                        auto virtual_to_physical = std::vector<std::size_t>();
+
+                        std::size_t cur_node = 0ul;
+
+                        for (const auto &physical:active_nodes_) {
+                            if ( physical ) {
+                                std::cout << "Node " << cur_node << " is alive!" << std::endl;
+                                virtual_to_physical.push_back(cur_node);
+                            }
+                            cur_node ++;
+                        }
+
+                        for (auto i = 0ul;  i< new_mapping.size(); ++i)
+                            new_mapping[i] = virtual_to_physical[new_mapping[i]];
+
                         previous_num_nodes = new_num_nodes;
                         hpx::lcos::broadcast_apply<allscale_optimizer_update_policy_action_ino>(localities_, new_mapping);
                     }

From eb457cce0e7c49d3c6c876ac343cadf0d64d24f5 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 19 Nov 2018 15:31:34 +0000
Subject: [PATCH 18/37] Generalize a bit NMD

- Though it still uses "threads" and "freq_idx" as knob names, the only
  restriction is that these values must be discrete (i.e. integers).
- Expects step() to be passed the actual measured values of "the knob
  set".
  - For example, the scheduler may not always manage to resume/suspend
    the number of threads that the local-optimizer suggests
- Added a maximum region that can be searched for new knob_set
  alternatives so that the spirit of the optimization process is kept
  sort of intact
---
 allscale/components/localoptimizer.hpp   |   2 +-
 allscale/components/nmsimplex_bbincr.hpp |  15 +-
 src/components/localoptimizer.cpp        |  18 ++-
 src/components/nmsimplex_bbincr.cpp      | 181 ++++++++++++++++-------
 src/components/scheduler_component.cpp   |   2 +-
 src/optimizer.cpp                        |  43 +++---
 6 files changed, 181 insertions(+), 80 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index 722520c..a26db1f 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -158,7 +158,7 @@ struct localoptimizer
 	void setmaxthreads(std::size_t threads);
 
 	/* executes one step of multi-objective optimization */
-	actuation step();
+	actuation step(std::size_t active_threads);
 
 	/* adds a measurement sample to the specified objective */
 	void measureObjective(double iter_time, double power, double threads);
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 8ed77dc..81704c3 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -129,7 +129,8 @@ class NelderMead
 	double evaluate_score(const double objectives[], const double *weights);
 	void set_weights(const double weights[]);
 
-	optstepresult step(const double objectives[]);
+	optstepresult step(const double objectives[], 
+			double knob1, double knob2);
 
   private:
 	int warming_up_step;
@@ -148,10 +149,14 @@ class NelderMead
 	MapCache_t cache_;
 
 	optstepresult do_step_start();
-	optstepresult do_step_reflect(const double objectives[]);
-	optstepresult do_step_expand(const double objectives[]);
-	optstepresult do_step_contract(const double objectives[]);
-	optstepresult do_step_shrink(const double objectives[]);
+	optstepresult do_step_reflect(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_expand(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_contract(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_shrink(const double objectives[], 
+			double knob1, double knob2);
 
 	void sort_vertices(void);
 	void my_constraints(double *);
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index f0a36d4..f70f76c 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -203,9 +203,11 @@ void localoptimizer::reset_accumulated_measurements()
 	pending_num_times = 0;
 }
 
-actuation localoptimizer::step()
+actuation localoptimizer::step(std::size_t active_threads)
 {
 	actuation act;
+	// VV: Possibly amend erroneous information
+	threads_param_  = active_threads;
 	act.threads = threads_param_;
 #ifdef ALLSCALE_HAVE_CPUFREQ
 	act.frequency_idx = frequency_param_;
@@ -232,7 +234,14 @@ actuation localoptimizer::step()
 		reset_accumulated_measurements();
 
 		if ( explore_knob_domain ){
-			optstepresult nmd_res = nmd.step(latest_measurements);
+			optstepresult nmd_res = nmd.step(latest_measurements,
+											 active_threads,
+#ifdef ALLSCALE_HAVE_CPUFREQ
+											 frequency_param_
+#else
+											0
+#endif
+											 );
 
 #ifdef DEBUG_MULTIOBJECTIVE_
 			std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try:";
@@ -291,6 +300,11 @@ actuation localoptimizer::step()
 	else if (act.frequency_idx > frequencies_param_allowed_.size() - 1)
 		act.frequency_idx = frequencies_param_allowed_.size() - 1;
 #endif
+	
+	threads_param_ = act.threads;
+#ifdef ALLSCALE_HAVE_CPUFREQ
+	frequency_param_ = act.frequency_idx;
+#endif
 	return act;
 }
 } // namespace components
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 9f42307..7fb76b5 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -11,6 +11,8 @@
  */
 
 #include <allscale/components/nmsimplex_bbincr.hpp>
+#include <cmath>
+
 #define NMD_DEBUG_ 1
 #define NMD_INFO_ 1
 
@@ -183,6 +185,9 @@ void NelderMead::generate_new(F &gen)
     max_nested_level *=2;
     auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
+    // VV: Restrict search-grid to a maximum block of 5x5
+    int retries = 0;
+    const int retries_threshold = 5*5;
     int is_same;
     do
     {
@@ -197,10 +202,12 @@ void NelderMead::generate_new(F &gen)
             auto dt = timestamp_now - entry->second._cache_timestamp;
             is_same = dt <= entry->second._cache_expires_dt;
         }
-
+        
+        ++ retries;
         if ( ( level < max_level +1) 
              && is_same 
-             && max_combinations > (NMD_NUM_KNOBS + 1))
+             && max_combinations > (NMD_NUM_KNOBS + 1)
+             && retries < retries_threshold )
         {
             # if 0
             extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0]) 
@@ -219,12 +226,20 @@ void NelderMead::generate_new(F &gen)
             #endif
             OUT_DEBUG(
                 std::cout << "[NelderMead|Debug] Rejecting " 
-                    << new_set[0] << " " << new_set[1] <<  std::endl;
+                    << new_set[0] << " " << new_set[1] 
+                    << " will try offset " << extra[0] << " " << extra[1] <<  std::endl;
             )
         } else {
             break;
         }
     } while ( 1 );
+
+    if ( retries >= retries_threshold ) {
+        extra[0] = 0;
+        extra[1] = 0;
+
+        gen(extra);
+    }
 }
 
 void NelderMead::my_constraints(double x[])
@@ -312,7 +327,7 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
     #else 
     score = 1.0;
     for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++ i) {
-        score *= exp(weights[i]*objectives[i]/scale[i]);
+        score *= std::exp(weights[i]*objectives[i]/scale[i]);
     }
     #endif
     return score;
@@ -611,33 +626,43 @@ optstepresult NelderMead::do_step_start()
 
         if (dt < entry->second._cache_expires_dt)
         {
-            return do_step_reflect(entry->second.objectives);
+            return do_step_reflect(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
         }
     }
 
     return res;
 }
 
-optstepresult NelderMead::do_step_reflect(const double objectives[])
+optstepresult NelderMead::do_step_reflect(const double objectives[], 
+            double knob1, double knob2)
 {
     optstepresult res;
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
 #endif
     // VV: Make sure that we actually profiled what we meant to
-    int profiled_threads = objectives[2];
-
-    if ( (int) vr[0] != profiled_threads ) {
-        std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " threads "
-                     "but ended up using " << profiled_threads << std::endl;
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled);
+
+    if ( vr[0] != profiled[0] || vr[1] != profiled[1] ) {
+        std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile " << vr[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
         
         auto key = std::make_pair((int)vr[0], (int)vr[1]);
         auto iter = cache_.find(key);
         if ( iter != cache_.end() ) {
-            iter->second.threads = profiled_threads;
+            iter->second.threads = profiled[0];
+            iter->second.freq_idx = profiled[1];
         }
 
-        vr[0] = profiled_threads;
+        vr[0] = profiled[0];
+        vr[1] = profiled[1];
+
+        cache_update((int)vr[0], (int)vr[1], objectives, true);
     }
 
     fr = evaluate_score(objectives, opt_weights);
@@ -695,7 +720,9 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
 
             if (dt < entry->second._cache_expires_dt)
             {
-                return do_step_expand(entry->second.objectives);
+                return do_step_expand(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
             }
         }
 
@@ -732,7 +759,9 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
 
             if (dt < entry->second._cache_expires_dt)
             {
-                return do_step_contract(entry->second.objectives);
+                return do_step_contract(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
             }
         }
 
@@ -768,7 +797,9 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
 
             if (dt < entry->second._cache_expires_dt)
             {
-                return do_step_contract(entry->second.objectives);
+                return do_step_contract(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
             }
         }
 
@@ -776,27 +807,34 @@ optstepresult NelderMead::do_step_reflect(const double objectives[])
     }
 }
 
-optstepresult NelderMead::do_step_expand(const double objectives[])
+optstepresult NelderMead::do_step_expand(const double objectives[],
+    double knob1, double knob2)
 {
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl;
 #endif
     fe = evaluate_score(objectives, nullptr);
 
-    // VV: Make sure that we actually profiled what we meant to
-    int profiled_threads = objectives[2];
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled);
 
-    if ( (int) ve[0] != profiled_threads ) {
-        std::cout << "[NelderMead|WARN] Meant to profile " << ve[0] << " threads "
-                     "but ended up using " << profiled_threads << std::endl;
+    if ( ve[0] != profiled[0] || ve[1] != profiled[1] ) {
+        std::cout << "[NelderMead|WARN] Meant to profile expand " << ve[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile expand " << ve[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
         
         auto key = std::make_pair((int)ve[0], (int)ve[1]);
         auto iter = cache_.find(key);
         if ( iter != cache_.end() ) {
-            iter->second.threads = profiled_threads;
+            iter->second.threads = profiled[0];
+            iter->second.freq_idx = profiled[1];
         }
 
-        ve[0] = profiled_threads;
+        ve[0] = profiled[0];
+        ve[1] = profiled[1];
+
+        cache_update((int)ve[0], (int)ve[1], objectives, true);
     }
 
     if (fe < fr)
@@ -827,7 +865,8 @@ optstepresult NelderMead::do_step_expand(const double objectives[])
     return do_step_start();
 }
 
-optstepresult NelderMead::do_step_contract(const double objectives[])
+optstepresult NelderMead::do_step_contract(const double objectives[],
+    double knob1, double knob2)
 {
     int j;
 #ifdef NMD_DEBUG_
@@ -835,20 +874,26 @@ optstepresult NelderMead::do_step_contract(const double objectives[])
 #endif
     fc = evaluate_score(objectives, nullptr);
 
-    // VV: Make sure that we actually profiled what we meant to
-    int profiled_threads = objectives[2];
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled);
 
-    if ( (int) vc[0] != profiled_threads ) {
-        std::cout << "[NelderMead|WARN] Meant to profile " << vc[0] << " threads "
-                     "but ended up using " << profiled_threads << std::endl;
+    if ( vc[0] != profiled[0] || vc[1] != profiled[1] ) {
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
         
         auto key = std::make_pair((int)vc[0], (int)vc[1]);
         auto iter = cache_.find(key);
         if ( iter != cache_.end() ) {
-            iter->second.threads = profiled_threads;
+            iter->second.threads = profiled[0];
+            iter->second.freq_idx = profiled[1];
         }
 
-        vc[0] = profiled_threads;
+        vc[0] = profiled[0];
+        vc[1] = profiled[1];
+
+        cache_update((int)vc[0], (int)vc[1], objectives, true);
     }
 
     if (fc <= fr)
@@ -904,7 +949,9 @@ optstepresult NelderMead::do_step_contract(const double objectives[])
 
             if (dt < entry->second._cache_expires_dt)
             {
-                return do_step_shrink(entry->second.objectives);
+                return do_step_shrink(entry->second.objectives, 
+                                        entry->second.threads, 
+                                        entry->second.freq_idx);
             }
         }
 
@@ -912,21 +959,34 @@ optstepresult NelderMead::do_step_contract(const double objectives[])
     }
 }
 
-optstepresult NelderMead::do_step_shrink(const double objectives[])
+optstepresult NelderMead::do_step_shrink(const double objectives[], 
+            double knob1, double knob2)
 {
 #ifdef NMD_DEBUG_
     std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl;
 #endif
     f[vh] = evaluate_score(objectives, nullptr);
 
-    // VV: Make sure that we actually profiled what we meant to
-    int profiled_threads = objectives[2];
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled);
 
-    if ( (int) v[vh][0] != profiled_threads ) {
-        std::cout << "[NelderMead|WARN] Meant to profile " << v[vh][0] << " threads "
-                     "but ended up using " << profiled_threads << std::endl;
+    if ( v[vh][0] != profiled[0] || v[vh][1] != profiled[1] ) {
+        std::cout << "[NelderMead|WARN] Meant to profile shrink " << v[vh][0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile shrink " << v[vh][1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
         
-        v[vh][0] = profiled_threads;
+        auto key = std::make_pair((int)v[vh][0], (int)v[vh][1]);
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled[0];
+            iter->second.freq_idx = profiled[1];
+        }
+
+        v[vh][0] = profiled[0];
+        v[vh][1] = profiled[1];
+
+        cache_update((int)v[vh][0], (int)v[vh][1], objectives, true);
     }
 
     const int threads = (int)(v[vh][0]);
@@ -937,7 +997,8 @@ optstepresult NelderMead::do_step_shrink(const double objectives[])
     return do_step_start();
 }
 
-optstepresult NelderMead::step(const double objectives[])
+optstepresult NelderMead::step(const double objectives[], 
+            double knob1, double knob2)
 {
     int i, j;
 
@@ -987,25 +1048,37 @@ optstepresult NelderMead::step(const double objectives[])
         )
 
         // VV: Make sure that we actually profiled what we meant to
-        int profiled_threads = objectives[2];
-
         if ( warming_up_step > 0 && warming_up_step <= NMD_NUM_KNOBS + 1) {
-            if ( (int) v[warming_up_step-1][0] != profiled_threads ) {
-                std::cout << "[NelderMead|WARN] Meant to profile " 
-                        << v[warming_up_step-1][0] << " threads "
-                        "but ended up using " << profiled_threads << std::endl;
-                v[warming_up_step-1][0] = profiled_threads;
+            double profiled[] = {knob1, knob2};
+            my_constraints(profiled);
+
+            if ( v[warming_up_step-1][0] != profiled[0] || v[warming_up_step-1][1] != profiled[1] ) {
+                std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][0] << " knob1 "
+                            "but ended up using " << profiled[0] << std::endl;
+                std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][1] << " knob2 "
+                            "but ended up using " << profiled[1] << std::endl;
+                
+                auto key = std::make_pair((int)v[warming_up_step-1][0], (int)v[warming_up_step-1][1]);
+                auto iter = cache_.find(key);
+                if ( iter != cache_.end() ) {
+                    iter->second.threads = profiled[0];
+                    iter->second.freq_idx = profiled[1];
+                }
+
+                v[warming_up_step-1][0] = profiled[0];
+                v[warming_up_step-1][1] = profiled[1];
             }
+            
             // VV: Record results of last warming up step
             f[warming_up_step-1] = evaluate_score(objectives, nullptr);
-            cache_update(profiled_threads, v[warming_up_step-1][1], 
+            cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], 
                          objectives, true);
         } 
 
         if ( warming_up_step == NMD_NUM_KNOBS + 1) {
             // VV: We need not explore the knob_set space anymore
             state_ = start;
-            return step(objectives);
+            return step(objectives, knob1, knob2);
         } else if (warming_up_step > NMD_NUM_KNOBS + 1) {
             std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl;
         }
@@ -1031,16 +1104,16 @@ optstepresult NelderMead::step(const double objectives[])
         res = do_step_start();
         break;
     case reflection:
-        res = do_step_reflect(objectives);
+        res = do_step_reflect(objectives, knob1, knob2);
         break;
     case expansion:
-        res = do_step_expand(objectives);
+        res = do_step_expand(objectives, knob1, knob2);
         break;
     case contraction:
-        res = do_step_contract(objectives);
+        res = do_step_contract(objectives, knob1, knob2);
         break;
     case shrink:
-        res = do_step_shrink(objectives);
+        res = do_step_shrink(objectives, knob1, knob2);
         break;
     default:
         std::cout << "Unknown NelderMead state (" << state_ << ")" << std::endl;
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index ce516b1..836e465 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -713,7 +713,7 @@ void scheduler::optimize_locally(work_item const& work)
             if (elapsedTimeMs > optimization_period_ms || nr_opt_steps == 0){
                 last_optimization_timestamp_= t_duration_now;
                 nr_opt_steps++;
-                actuation act_temp = lopt_.step();
+                actuation act_temp = lopt_.step(active_threads);
 #ifdef DEBUG_MULTIOBJECTIVE_
                 lopt_.printverbosesteps(act_temp);
 #endif
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 2fcc698..8567870 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -37,7 +37,7 @@ namespace allscale
             my_time = -1.f;
 
         allscale::components::monitor *monitor_c = &allscale::monitor::get();
-        float power_now = 100.f;
+        float power_now = 0.001f;
 #if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         power_now = monitor_c->get_current_power() / monitor_c->get_max_power();
 #endif
@@ -503,7 +503,9 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     nmd_initialized = 1;
                 }
 
-                auto action = nmd.step(measurements);
+                auto action = nmd.step(measurements, 
+                                        previous_num_nodes,
+                                        avg_threads * previous_num_nodes);
                 // VV: Todo do something with the action
                 //     assume that .threads = nodes and .freq_idx = threads per node
                 int new_num_nodes = action.threads;
@@ -552,13 +554,34 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                             }
                         }
                     */
+                    // VV: Some of the nodes might be dead, convert the virtual name
+                    //     to the physical name
+                    std::size_t num_active_nodes = std::count(active_nodes_.begin(),                                active_nodes_.end(), true);
+                    auto virtual_to_physical = std::vector<std::size_t>();
+
+                    std::size_t cur_node = 0ul;
+
+                    for (const auto &physical:active_nodes_) {
+                        if ( physical ) {
+                            std::cout << "Node " << cur_node << " is alive!" << std::endl;
+                            virtual_to_physical.push_back(cur_node);
+                        }
+                        cur_node ++;
+                    }
+
+                    if ( new_num_nodes > num_active_nodes )
+                        new_num_nodes = num_active_nodes;
+                    
+                    if ( previous_num_nodes > num_active_nodes )
+                        previous_num_nodes = num_active_nodes;
+                    
                     auto new_avg_tasks = (std::size_t) std::ceil(old_mapping.size()/
                                                                  (float)new_num_nodes);
                     auto new_mapping = std::vector<std::size_t>(old_mapping.size(), 0ul);
                     auto node_to_tasks = std::map<std::size_t, std::vector<std::size_t> >();
                     // VV: node_to_tasks maps node id to list of tasks that it's running
                     std::size_t task_id = 0;
-                    std::size_t num_active_nodes = std::count(active_nodes_.begin(),                                active_nodes_.end(), true);
+                    
 
                     for (auto i=0ul; i<num_active_nodes; ++i)
                         node_to_tasks.insert(std::make_pair(i, std::vector<std::size_t>()));
@@ -689,20 +712,6 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                             }
 
                         }
-                        // VV: Some of the nodes might be dead, convert the virtual name
-                        //     to the physical name
-                        auto virtual_to_physical = std::vector<std::size_t>();
-
-                        std::size_t cur_node = 0ul;
-
-                        for (const auto &physical:active_nodes_) {
-                            if ( physical ) {
-                                std::cout << "Node " << cur_node << " is alive!" << std::endl;
-                                virtual_to_physical.push_back(cur_node);
-                            }
-                            cur_node ++;
-                        }
-
                         for (auto i = 0ul;  i< new_mapping.size(); ++i)
                             new_mapping[i] = virtual_to_physical[new_mapping[i]];
 

From de28905df5d4b18a453641a603db79b0fc40068f Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 19 Nov 2018 16:09:08 +0000
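Subject: [PATCH 19/37] Improved INO_NMD and dashboard integration

- optimizer: add global_optimizer::signal_objective_changed(), which passes
  the speed, power and efficiency exponents to the NMD optimizer as its
  weights and clears nmd_initialized
- scheduler: call signal_objective_changed() after each of the speed,
  efficiency and power exponents is updated
- dashboard: add ALTERNATIVE_SCORE (time/energy/resources instead of
  speed/energy/efficiency); it selects the same code path as
  ALLSCALE_HAVE_CPUFREQ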
---
 allscale/optimizer.hpp |  2 ++
 src/dashboard.cpp      |  7 +++++--
 src/optimizer.cpp      | 14 ++++++++++++++
 src/scheduler.cpp      |  5 +++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/allscale/optimizer.hpp b/allscale/optimizer.hpp
index 7f019fa..4452bea 100644
--- a/allscale/optimizer.hpp
+++ b/allscale/optimizer.hpp
@@ -110,6 +110,8 @@ namespace allscale {
         hpx::future<void> balance_ino(const std::vector<std::size_t> &old_mapping);
         hpx::future<void> balance_ino_nmd(const std::vector<std::size_t> &old_mapping);
         hpx::future<void> decide_random_mapping(const std::vector<std::size_t> &old_mapping);
+        
+        void signal_objective_changed();
 
         bool may_rebalance();
 
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index fcf7e8b..6643e1b 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -23,6 +23,9 @@
 #include <boost/asio.hpp>
 
 
+// VV: Define this to use time/energy/resources instead of speed/energy/efficiency
+#define ALTERNATIVE_SCORE 
+
 namespace allscale { namespace dashboard
 {
     node_state get_state()
@@ -57,7 +60,7 @@ namespace allscale { namespace dashboard
 
         state.productive_cycles_per_second = float(state.cur_frequency) * (1.f - state.idle_rate);  // freq to Hz
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
+#if defined(ALLSCALE_HAVE_CPUFREQ) || defined(ALTERNATIVE_SCORE)
         state.speed = monitor_c->get_avg_time_last_iterations(100);
         state.efficiency = active_cores;
 #else
@@ -168,7 +171,7 @@ namespace allscale { namespace dashboard
 
     float system_state::score() const
     {
-#ifdef ALLSCALE_HAVE_CPUFREQ
+#if defined(ALLSCALE_HAVE_CPUFREQ) || defined(ALTERNATIVE_SCORE)
         return std::exp(speed * speed_exponent) *
                 std::exp(efficiency * efficiency_exponent ) *
                 std::exp(power * power_exponent);
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 8567870..4311e4c 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -239,6 +239,20 @@ global_optimizer::global_optimizer()
     }
 }
 
+void global_optimizer::signal_objective_changed()
+{
+    const double new_weights[3] = {
+        objective_.speed_exponent,
+        objective_.power_exponent,
+        objective_.efficiency_exponent
+    };
+
+    nmd.set_weights(new_weights);
+
+    if ( nmd_initialized )
+        nmd_initialized = 0;
+}
+
 void global_optimizer::tune(std::vector<optimizer_state> const &state)
 {
     allscale::components::monitor *monitor_c = &allscale::monitor::get();
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 6a63a0a..ab19eff 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -379,6 +379,8 @@ namespace allscale
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.speed_exponent = exp;
+            optimizer_.signal_objective_changed();
+
             double time_weight, energy_weight, resource_weight;
 
             auto &&local_scheduler = scheduler::get();
@@ -397,6 +399,8 @@ namespace allscale
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.efficiency_exponent = exp;
+            optimizer_.signal_objective_changed();
+
             double time_weight, energy_weight, resource_weight;
 
             auto &&local_scheduler = scheduler::get();
@@ -415,6 +419,7 @@ namespace allscale
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.power_exponent = exp;
+            optimizer_.signal_objective_changed();
             double time_weight, energy_weight, resource_weight;
             
             auto &&local_scheduler = scheduler::get();

From d80618ba4a600801691c96e266fd711256e8b878 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 19 Nov 2018 16:53:26 +0000
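Subject: [PATCH 20/37] Improved INO_NMD and dashboard integration

- optimizer: add the OUT_DEBUG macro (active while DEBUG_NMD_INO is defined)
  and wrap the balance_ino_nmd debug output in it
- optimizer: drop the per-node statistics and NMD constraint printouts
- dashboard: with ALLSCALE_HAVE_CPUFREQ or ALTERNATIVE_SCORE, report
  efficiency as the total over all localities (the number of active
  threads) instead of the average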
---
 src/dashboard.cpp |  7 +++-
 src/optimizer.cpp | 89 ++++++++++++++++++++++++++---------------------
 2 files changed, 56 insertions(+), 40 deletions(-)

diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index 6643e1b..d02f98b 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -503,13 +503,18 @@ namespace allscale { namespace dashboard
                         total_efficiency += cur.efficiency;
                         cur_power += cur.cur_power;
                     }
+
                     max_power += cur.max_power;
                 }
 
                 state.speed = total_speed / client.localities_.size();
 //                 state.speed = std::pow(total_speed, 1.f/client.localities_.size());
-
+#if defined(ALLSCALE_HAVE_CPUFREQ) || defined(ALTERNATIVE_SCORE)
+                // VV: This is the number of active threads
+                state.efficiency = total_efficiency;
+#else
                 state.efficiency = total_efficiency / client.localities_.size();
+#endif
                 state.power = (max_power > 0) ? cur_power/max_power : 0;
 
                 auto exponents = scheduler::get_optimizer_exponents();
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 4311e4c..924a9f3 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -26,6 +26,16 @@
 
 #define TRULY_RANDOM_DEBUG
 
+#define DEBUG_NMD_INO 1
+
+#ifdef DEBUG_NMD_INO
+#define OUT_DEBUG(X) X
+#else
+#define OUT_DEBUG(X) \
+    {                \
+    }
+#endif
+
 namespace allscale
 {
     optimizer_state get_optimizer_state()
@@ -472,12 +482,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     }
                     avg_energy += s.energy_;
                     avg_threads += s.active_cores_per_node_ / (float) s.cores_per_node_;
-                    std::cout << "From " << from_node 
-                        << " t:" << s.avg_time_
-                        << " e:" << s.energy_
-                        << " h:" << s.active_cores_per_node_ / (float) s.cores_per_node_
-                        << " (" << s.active_cores_per_node_ << ", " 
-                        <<s.cores_per_node_ << std::endl;
+
                     ++from_node;
                 }
                 
@@ -502,13 +507,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                                                       (double) threads_min};
                     const double constraint_max[] = {(double) nodes_max, 
                                                     (double) threads_max};
-                    for ( auto i=0; i<2; ++i ) {
-                        std::cout << "NMD Constraints[" << i << "]: "
-                                    << constraint_min[0] << " -> " 
-                                    << constraint_max[0] << " and "
-                                    << constraint_min[1] << " -> " 
-                                    << constraint_max[1] << std::endl;
-                    }
+
                     nmd.initialize_simplex(weights, 
                                             nullptr,
                                             constraint_min,
@@ -577,7 +576,9 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
 
                     for (const auto &physical:active_nodes_) {
                         if ( physical ) {
-                            std::cout << "Node " << cur_node << " is alive!" << std::endl;
+                            OUT_DEBUG(
+                                std::cout << "[Ino_NMD] Node " << cur_node << " is alive!" << std::endl;
+                            )
                             virtual_to_physical.push_back(cur_node);
                         }
                         cur_node ++;
@@ -603,20 +604,22 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     for ( const auto &node_id:old_mapping )
                         node_to_tasks[node_id].push_back(task_id++);
 
+                    OUT_DEBUG(
+                        std::cout << "[Ino_NMD] Rebalancing (original):" << std::endl;
 
-                    std::cout << "[GLOBAL OPTIMIZER] Rebalancing (original):" << std::endl;
-
-                    for ( const auto &node: node_to_tasks ) {
-                        std::cout << "node " << node.first << ": ";
-                        for ( const auto &task:node.second)
-                            std::cout << " " << task;
-                        std::cout << std::endl;
-                    }
+                        for ( const auto &node: node_to_tasks ) {
+                            std::cout << "node " << node.first << ": ";
+                            for ( const auto &task:node.second)
+                                std::cout << " " << task;
+                            std::cout << std::endl;
+                        }
+                    )
 
                     // VV: Something else is setting the scheduling policy too
                     //     try to redistribute tasks to all @previous_num_nodes
-
-                    std::cout << "[GLOBAL OPTIMIZER] Re-balancing previous nodes" << std::endl;
+                    OUT_DEBUG(
+                        std::cout << "[GLOBAL OPTIMIZER] Re-balancing previous nodes" << std::endl;
+                    )
 
                     auto prev_avg_tasks =
                     (std::size_t) std::ceil(old_mapping.size() /
@@ -641,23 +644,27 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                         }
                     }
 
-                    std::cout << "[GLOBAL OPTIMIZER] Rebalanced (still original):" << std::endl;
+                    OUT_DEBUG(
+                        std::cout << "[GLOBAL OPTIMIZER] Rebalanced (still original):" << std::endl;
 
-                    for ( const auto &node: node_to_tasks ) {
-                        std::cout << "node " << node.first << ": ";
-                        for ( const auto &task:node.second)
-                            std::cout << " " << task;
-                        std::cout << std::endl;
-                    }
+                        for ( const auto &node: node_to_tasks ) {
+                            std::cout << "node " << node.first << ": ";
+                            for ( const auto &task:node.second)
+                                std::cout << " " << task;
+                            std::cout << std::endl;
+                        }
 
 
-                    std::cout << "[GLOBAL OPTIMIZER] Changing nodes from "
-                              << previous_num_nodes
-                              << " to " << new_num_nodes << std::endl;
+                        std::cout << "[GLOBAL OPTIMIZER] Changing nodes from "
+                                << previous_num_nodes
+                                << " to " << new_num_nodes << std::endl;
+                    )
 
                     if (new_num_nodes < previous_num_nodes)
                     {
-                        std::cout << "[GLOBAL OPTIMIZER] Decreasing nodes" << std::endl;
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Decreasing nodes" << std::endl;
+                        )
                         auto lost_node = new_num_nodes;
 
                         while (lost_node < previous_num_nodes && node_to_tasks[lost_node].size())
@@ -689,7 +696,9 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     }
                     else if (new_num_nodes > previous_num_nodes)
                     {
-                        std::cout << "[GLOBAL OPTIMIZER] Increasing nodes" << std::endl;
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Increasing nodes" << std::endl;
+                        )
                         auto new_node = previous_num_nodes - 1;
                         for (auto node_id = 0ul; node_id < previous_num_nodes; ++node_id)
                         {
@@ -711,11 +720,13 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     }
                     else
                     {
-                        std::cout << "[GLOBAL OPTIMIZER] Did not modify mapping" << std::endl;
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Did not modify mapping" << std::endl;
+                        )
                     }
 
                     if (previous_num_nodes != new_num_nodes ){
-                        {
+                        OUT_DEBUG(
                             std::cout << "[GLOBAL OPTIMIZER] Rebalancing (NEW):" << std::endl;
 
                             for ( const auto &node: node_to_tasks ) {
@@ -724,8 +735,8 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                                     std::cout << " " << task;
                                 std::cout << std::endl;
                             }
+                        )
 
-                        }
                         for (auto i = 0ul;  i< new_mapping.size(); ++i)
                             new_mapping[i] = virtual_to_physical[new_mapping[i]];
 

From 520b3e1bcbe10c1c66388c3c064a711f56e500f9 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 20 Nov 2018 11:38:50 +0000
Subject: [PATCH 21/37] ALLSCALE_HAVE_CPUFREQ determines whether CPUFREQ is
 available

- We can set CPU governor, retrieve and set CPU frequency
---
 allscale/components/localoptimizer.hpp |  22 +-
 allscale/components/scheduler.hpp      |  10 +-
 src/components/localoptimizer.cpp      |  47 ++--
 src/components/monitor_component.cpp   |   8 +-
 src/components/scheduler_component.cpp | 359 ++++++++++---------------
 src/dashboard.cpp                      |  12 +-
 6 files changed, 170 insertions(+), 288 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index a26db1f..1a04e9d 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -44,14 +44,7 @@ enum searchPolicy
 struct actuation
 {
 	unsigned int threads;
-
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-	/* index to the global cpu-supported frequencies vector pointing to
-           the new frequency to be set. If set to -1, frequency will stay
-           unchanged */
 	int frequency_idx;
-	int previous_frequency_idx;
-#endif
 };
 
 struct localoptimizer
@@ -62,9 +55,7 @@ struct localoptimizer
 		  pending_time(0.),
 		  pending_num_times(0.),
 		  mo_initialized(false),
-#if defined(ALLSCALE_HAVE_CPUFREQ)
 		  frequency_param_(0),
-#endif
 		  converged_(false),
 		  convergence_threshold_(0.005),
 		  time_weight(0.0),
@@ -87,9 +78,7 @@ struct localoptimizer
 				  << std::endl;
 #endif
 	}
-#ifdef ALLSCALE_HAVE_CPUFREQ
 	void initialize_nmd(bool from_scratch);
-#endif
 	searchPolicy getPolicy() { return optmethod_; }
 
 	// VV: Modifying the objectives triggers restarting the optimizer
@@ -113,7 +102,7 @@ struct localoptimizer
 
 	void setCurrentThreads(std::size_t threads) { threads_param_ = threads; }
 
-#if defined(ALLSCALE_HAVE_CPUFREQ)
+
 	unsigned int getCurrentFrequencyIdx()
 	{
 		return frequency_param_;
@@ -149,7 +138,7 @@ struct localoptimizer
 		//  std::cout << "***>>>> " << el << std::endl;
 		return frequencies_param_allowed_;
 	}
-#endif
+
 	std::size_t getmaxthreads()
 	{
 		return max_threads_;
@@ -226,19 +215,12 @@ struct localoptimizer
 	/* maximum number of OS threads supported by the runtime */
 	std::size_t max_threads_;
 
-#if defined(ALLSCALE_HAVE_CPUFREQ)
 	/* active optimization parameter - current CPU frequency index */
 	unsigned int frequency_param_;
 
-	/* ordered set of frequency values that the CPU has been set to by
-           the optimization algorithm. The most recent value is stored at the
-           end of the vector */
-	std::vector<unsigned long> frequency_param_values_;
-
 	/* vector containing sorted list of frequencies supported by the
            processor */
 	std::vector<unsigned long> frequencies_param_allowed_;
-#endif
 
 	/* threshold (percentage in [0,1]) to decide convergence of optimization
            steps */
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index f728526..90d32e2 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -170,18 +170,12 @@ namespace allscale { namespace components {
         unsigned long long last_power_usage;
         unsigned long long power_sum;
         unsigned long long power_count;
+
 #if defined(ALLSCALE_HAVE_CPUFREQ)
         cpufreq_policy policy;
         hardware_reconf::hw_topology topo;
-        std::vector<unsigned long> cpu_freqs;
-        // Indices correspond to the freq id in cpu_freqs, and
-        // each pair holds energy usage and execution time
-        std::vector<std::pair<unsigned long long, double>> freq_times;
-        
-        unsigned int freq_step;
-        bool target_freq_found;
 #endif
-        bool target_resource_found;
+        std::vector<unsigned long> cpu_freqs;
 
         mutable mutex_type throttle_mtx_;
         mutable mutex_type resize_mtx_;
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index f70f76c..6676738 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -41,10 +41,6 @@ void localoptimizer::setobjectives(double time_weight,
 	this->energy_weight = energy_weight;
 	this->resource_weight = resource_weight;
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-	setCurrentFrequencyIdx(0);
-#endif
-
 	// VV: Modifying the objectives triggers restarting the optimizer
 	//     from scratch
 	
@@ -56,10 +52,8 @@ void localoptimizer::reset(int threads, int freq_idx)
 {
 	threads_param_ = threads;
 	thread_param_values_.clear();
-#ifdef ALLSCALE_HAVE_CPUFREQ
+
 	frequency_param_ = freq_idx;
-	frequency_param_values_.clear();
-#endif
 	converged_ = false;
 };
 
@@ -104,14 +98,11 @@ void localoptimizer::printverbosesteps(actuation act)
 		std::cout << "Allscale ";
 	}
 	std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
+
 	if (act.frequency_idx >= 0)
 		last_frequency_idx = act.frequency_idx;
 	std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx]
 			  << std::endl;
-#else
-	std::cout << std::endl;
-#endif
 }
 
 void localoptimizer::accumulate_objective_measurements()
@@ -150,14 +141,20 @@ void localoptimizer::setmaxthreads(std::size_t threads)
 	#endif
 }
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
+
 void localoptimizer::initialize_nmd(bool from_scratch)
 {
 	// VV: Place constraints to #threads and cpu_freq tunable knobs
 
 	double constraint_min[] = {1, 0};
+	#if defined(ALLSCALE_HAVE_CPUFREQ)
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
 							   (double)frequencies_param_allowed_.size() - 1};
+	#else 
+	std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl;
+	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+						       0.0};
+	#endif
 	const double opt_weights[] = { time_weight, energy_weight, resource_weight };
 
 	if( from_scratch == false ){
@@ -180,7 +177,6 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 	explore_knob_domain = true;
 	converged_ = false;
 }
-#endif
 
 void localoptimizer::measureObjective(double iter_time, double power, double threads)
 {
@@ -209,18 +205,15 @@ actuation localoptimizer::step(std::size_t active_threads)
 	// VV: Possibly amend erroneous information
 	threads_param_  = active_threads;
 	act.threads = threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
+
 	act.frequency_idx = frequency_param_;
-#endif
+
 	/* random optimization step */
 	if (optmethod_ == random)
 	{
 		act.threads = (rand() % max_threads_);
-#ifdef ALLSCALE_HAVE_CPUFREQ
 		act.frequency_idx = rand() % frequencies_param_allowed_.size();
-#endif
 	}
-#ifdef ALLSCALE_HAVE_CPUFREQ
 	else if (optmethod_ == allscale)
 	{
 		// VV: Keep track of dirty objectives
@@ -236,12 +229,7 @@ actuation localoptimizer::step(std::size_t active_threads)
 		if ( explore_knob_domain ){
 			optstepresult nmd_res = nmd.step(latest_measurements,
 											 active_threads,
-#ifdef ALLSCALE_HAVE_CPUFREQ
-											 frequency_param_
-#else
-											0
-#endif
-											 );
+											 frequency_param_);
 
 #ifdef DEBUG_MULTIOBJECTIVE_
 			std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try:";
@@ -262,6 +250,7 @@ actuation localoptimizer::step(std::size_t active_threads)
 #endif
 				act.threads = minimization_point[0];
 				act.frequency_idx = minimization_point[1];
+				
 				// VV: Stop searching for new knob_set
 				explore_knob_domain = false;
 				converged_ = true;
@@ -281,8 +270,6 @@ actuation localoptimizer::step(std::size_t active_threads)
 #endif
 		}
 	}
-#endif // ALLSCALE_HAVE_CPUFREQ
-
 validate_act:
 
 	if (act.threads > max_threads_)
@@ -293,18 +280,16 @@ actuation localoptimizer::step(std::size_t active_threads)
 	{
 		act.threads = getCurrentThreads();
 	}
-#ifdef ALLSCALE_HAVE_CPUFREQ
+
 	// VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
 	if (act.frequency_idx < 0)
 		act.frequency_idx = frequency_param_;
 	else if (act.frequency_idx > frequencies_param_allowed_.size() - 1)
 		act.frequency_idx = frequencies_param_allowed_.size() - 1;
-#endif
-	
+
 	threads_param_ = act.threads;
-#ifdef ALLSCALE_HAVE_CPUFREQ
 	frequency_param_ = act.frequency_idx;
-#endif
+
 	return act;
 }
 } // namespace components
diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp
index d1817ae..5ae6463 100644
--- a/src/components/monitor_component.cpp
+++ b/src/components/monitor_component.cpp
@@ -358,7 +358,7 @@ namespace allscale { namespace components {
 
    float monitor::get_current_power()
    {
-#ifdef ALLSCALE_HAVE_CPUFREQ
+      #ifdef ALLSCALE_HAVE_CPUFREQ
       /*VV: Read potentially multiple measurements of power within the span of 
             POWER_MEASUREMENT_PERIOD_MS milliseconds. Each time this function
             is invoked it returns the running average of power.*/
@@ -388,9 +388,9 @@ namespace allscale { namespace components {
       }
 
       return ret;
-#else
+      #else
       return allscale::power::estimate_power(get_current_freq(0)) * num_cpus_;
-#endif
+      #endif
    }
 
 
@@ -406,7 +406,7 @@ namespace allscale { namespace components {
 #elif defined(POWER_ESTIMATE)
       return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_;
 #else
-      return 0.0;
+      return 125.0;
 #endif
    }
 
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 836e465..76ed483 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -50,14 +50,7 @@ scheduler::scheduler(std::uint64_t rank)
       current_power_usage(0),
       last_power_usage(0),
       power_sum(0),
-      power_count(0)
-
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-      ,
-      target_freq_found(false)
-#endif
-      ,
-      target_resource_found(false),
+      power_count(0),
       sampling_interval(10),
       current_avg_iter_time(0.0),
       multi_objectives(false),
@@ -103,8 +96,9 @@ scheduler::scheduler(std::uint64_t rank)
 #endif
 #ifdef ALLSCALE_HAVE_CPUFREQ
   std::cout << "ALLSCALE_HAVE_CPUFREQ is defined" << std::endl << std::flush;
+#else
+  std::cout << "ALLSCALE_HAVE_CPUFREQ is not defined. No real power measurements or CPU frequency scaling" << std::endl << std::flush;
 #endif
-
 }
 
 /**
@@ -229,15 +223,11 @@ void scheduler::init() {
 #ifdef DEBUG_MULTIOBJECTIVE_
     std::cout << "[Local Optimizer|INFO] Optimization Policy Active = " << input_optpolicy_str << std::endl;
 #endif
-#if ALLSCALE_HAVE_CPUFREQ
-    if (input_optpolicy_str=="allscale") {
+  if (input_optpolicy_str=="allscale")
 		lopt_.setPolicy(allscale);
-	}
-    else 
-#endif
-	if (input_optpolicy_str=="random")
+  else 	if (input_optpolicy_str=="random")
       lopt_.setPolicy(random);
-    else if (input_optpolicy_str=="manual")
+  else if (input_optpolicy_str=="manual")
       lopt_.setPolicy(manual);
 	else if ( input_optpolicy_str != "none" ) {
 		HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", 
@@ -350,10 +340,9 @@ void scheduler::init() {
     executors_.emplace_back(pool_name);
   }
 
-#if defined(ALLSCALE_HAVE_CPUFREQ)
   if (multi_objectives) {
 
-#ifdef DEBUG_INIT_
+    #ifdef DEBUG_INIT_
     std::cout << "\n****************************************************\n" << std::flush;
     std::cout << "Policy selected: multi-objective set with time=" << time_weight
               << ", energy=" << energy_weight 
@@ -367,18 +356,16 @@ void scheduler::init() {
               "\tMulti-objective: " << multi_objectives <<
               "\n" << std::flush;
     std::cout << "****************************************************\n" << std::flush;
-#endif
+    #endif
   }
 
   if (energy_requested)
     initialize_cpu_frequencies();
 
-#ifdef MEASURE_MANUAL_
+  #ifdef MEASURE_MANUAL_
   if (manual_input_provided && input_objective_str.empty())
       fix_allcores_frequencies(temp_idx);
-#endif
-
-#endif
+  #endif
 
   initialized_ = true;
 #ifdef DEBUG_INIT_
@@ -398,7 +385,7 @@ void scheduler::init() {
 
     lopt_.setmaxthreads(os_thread_count);
 
- #if defined(ALLSCALE_HAVE_CPUFREQ)
+    #if defined(ALLSCALE_HAVE_CPUFREQ)
     using hardware_reconf = allscale::components::util::hardware_reconf;
     auto  freqs = hardware_reconf::get_frequencies(0);
 
@@ -409,28 +396,18 @@ void scheduler::init() {
     }
     // VV: Set to max number of threads and max frequency
     lopt_.reset(os_thread_count, freqs.size()-1);
-#else
+    #else
     // VV: Max number of threads, and an arbitrary frequency index
     lopt_.reset(os_thread_count,0);
-#endif
+    #endif
     
     // VV: Set objectives after setting all constraints to
     //     trigger the initialization of nmd
     lopt_.setobjectives(time_weight, energy_weight, resource_weight);
-#ifdef DEBUG_
+    #ifdef DEBUG_
     lopt_.printobjectives();
-#endif
+    #endif
   }
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-else {
-    /*
-    using hardware_reconf = allscale::components::util::hardware_reconf;
-    auto  freqs = hardware_reconf::get_frequencies(0);
-    // VV: Set maximum frequency
-    fix_allcores_frequencies(freqs[freqs.size()-1]);
-    */
-}
-#endif
 }
 
 /**
@@ -442,16 +419,13 @@ else {
  * potential.
  *
 */
-void scheduler::initialize_cpu_frequencies() {
 #if defined(ALLSCALE_HAVE_CPUFREQ)
+void scheduler::initialize_cpu_frequencies() 
+{
   using hardware_reconf = allscale::components::util::hardware_reconf;
   cpu_freqs = hardware_reconf::get_frequencies(0);
-  freq_step = 8; // cpu_freqs.size() / 2;
-  freq_times.resize(cpu_freqs.size());
-
-#ifdef MEASURE_
-#ifdef ALLSCALE_HAVE_CPUFREQ
-#ifdef DEBUG_INIT_
+  
+  #if defined(MEASURE_) && defined(DEBUG_INIT_)  // gates the transition-latency trace below
   unsigned long temp_transition_latency=hardware_reconf::get_cpu_transition_latency(1);
   if (temp_transition_latency==0)
     std::cout << "[INFO] Transition Latency Unavailable" <<
@@ -460,45 +434,37 @@ void scheduler::initialize_cpu_frequencies() {
     std::cout << "[INFO] Core-1 Frequency Transition Latency = " <<
       hardware_reconf::get_cpu_transition_latency(2)/1000 <<
       " milliseconds\n" << std::flush;
-#endif
-#endif
-#endif
 
-#ifdef DEBUG_INIT_
+  #endif
+
+  #ifdef DEBUG_INIT_
   std::cout << "[INFO] Governors available on the system: " <<
       "\n" << std::flush;
-#ifdef ALLSCALE_HAVE_CPUFREQ
   std::vector<std::string> temp_governors = hardware_reconf::get_governors(0);
   for (std::vector<std::string>::const_iterator i = temp_governors.begin(); i != temp_governors.end(); ++i)
     std::cout << "[INFO]\t" << *i << "\n" << std::flush;
-#endif
   std::cout << "\n" << std::flush;
-#endif
 
-#ifdef DEBUG_INIT_
   std::cout << "Server Processor Available Frequencies (size = " << cpu_freqs.size() << ")";
   for (auto &ind : cpu_freqs) {
     std::cout << ind << " ";
   }
   std::cout << "\n" << std::flush;
-#endif
+  #endif
 
   auto min_max_freqs = std::minmax_element(cpu_freqs.begin(), cpu_freqs.end());
   min_freq = *min_max_freqs.first;
   max_freq = *min_max_freqs.second;
-
-#ifdef DEBUG_INIT_
-  std::cout << "Min freq:  " << min_freq << ", Max freq: " << max_freq << "\n"
-            << std::flush;
-#endif
   // TODO: verify that nbpus == all pus of the system, not just the online
   // ones
   size_t nbpus = topo_->get_number_of_pus();
-#ifdef DEBUG_INIT_
+
+  #ifdef DEBUG_INIT_
+  std::cout << "Min freq:  " << min_freq << ", Max freq: " << max_freq << "\n"
+            << std::flush;
   std::cout << "nbpus known to topo_:  " << nbpus << "\n" << std::flush;
-#endif
+  #endif
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
   hardware_reconf::make_cpus_online(0, nbpus);
   hardware_reconf::topo_init();
   // We have to set CPU governors to userpace in order to change frequencies
@@ -509,13 +475,12 @@ void scheduler::initialize_cpu_frequencies() {
 
   topo = hardware_reconf::read_hw_topology();
   // first reinitialize to a normal setup
-  for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id++) {
+  for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id++){
     hardware_reconf::set_freq_policy(cpu_id, policy);
-#ifdef DEBUG_INIT_
-    std::cout << "cpu_id " << cpu_id << " back to on-demand. ret=  " << res
-              << "\n"
-              << std::flush;
-#endif
+    #ifdef DEBUG_INIT_
+    std::cout << "cpu_id " << cpu_id << " back to on-demand. ret=  " 
+              << res << std::endl;
+    #endif
   }
 
   governor = "userspace";
@@ -523,8 +488,10 @@ void scheduler::initialize_cpu_frequencies() {
   policy.min = min_freq;
   policy.max = max_freq;
 
-  for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores;
-       cpu_id += topo.num_hw_threads) {
+  for (unsigned int cpu_id = 0; 
+       cpu_id < topo.num_logical_cores;
+       cpu_id += topo.num_hw_threads) 
+  {
     int res = hardware_reconf::set_freq_policy(cpu_id, policy);
     if (res) {
       HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
@@ -533,34 +500,29 @@ void scheduler::initialize_cpu_frequencies() {
 
       return;
     }
-#ifdef DEBUG_INIT_
+  #ifdef DEBUG_INIT_
     std::cout << "cpu_id " << cpu_id
               << " initial freq policy setting. ret=  " << res << "\n"
               << std::flush;
-#endif
+  #endif
   }
-#endif
-
   // Set frequency of all threads to max when we start
 
-  {
-    // set freq to all PUs used by allscale
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
+  // set freq to all PUs used by allscale
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        if (!cpufreq_cpu_exists(pu_num)) {
-          hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[0]);
-#ifdef DEBUG_INIT_
-          std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[0]
-                    << ", (ret= " << res << ")\n"
-                    << std::flush;
-#endif
-        }
-#endif
+      if (!cpufreq_cpu_exists(pu_num)) {
+        int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[0]);
+        (void)res; // res is only read by the DEBUG_INIT_ trace below
+        #ifdef DEBUG_INIT_
+        std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[0]
+                  << ", (ret= " << res << ")\n"
+                  << std::flush;
+        #endif
       }
     }
   }
@@ -571,35 +533,31 @@ void scheduler::initialize_cpu_frequencies() {
   std::cout << "topo.num_logical_cores: " << topo.num_logical_cores
             << " topo.num_hw_threads" << topo.num_hw_threads << "\n"
             << std::flush;
-  {
-    // check status of Pus frequency
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      unsigned long hardware_freq = 0;
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
-
-        if (!cpufreq_cpu_exists(pu_num)) {
-          do {
-            hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
-#ifdef DEBUG_INIT_
-            std::cout << "current freq on cpu " << pu_num << " is "
-                      << hardware_freq << " (target freq is " << cpu_freqs[0]
-                      << " )\n"
-                      << std::flush;
-
-#endif
+      // check status of Pus frequency
+    
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    unsigned long hardware_freq = 0;
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
+
+      if (!cpufreq_cpu_exists(pu_num)) {
+        do {
+          hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
+        #ifdef DEBUG_INIT_
+          std::cout << "current freq on cpu " << pu_num << " is "
+                    << hardware_freq << " (target freq is " << cpu_freqs[0]
+                    << " )\n"
+                    << std::flush;
+        #endif
 
-          } while (hardware_freq != cpu_freqs[0]);
-        }
+        } while (hardware_freq != cpu_freqs[0]);
       }
     }
-#endif
   }
 
-#ifdef ALLSCALE_USE_CORE_OFFLINING
+  #ifdef ALLSCALE_USE_CORE_OFFLINING
   // offline unused cpus
   for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores;
        cpu_id += topo.num_hw_threads) {
@@ -612,25 +570,23 @@ void scheduler::initialize_cpu_frequencies() {
     }
 
     if (!found_it) {
-#ifdef DEBUG_INIT_
+      #ifdef DEBUG_INIT_
       std::cout << " setting cpu_id " << cpu_id << " offline \n" << std::flush;
-#endif
+      #endif
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
       hardware_reconf::make_cpus_offline(cpu_id, cpu_id + topo.num_hw_threads);
-#endif
     }
   }
-#endif
-
+  #endif
+}
 #else
-  // should we really abort or should we reset energy to 1 ?
-  HPX_THROW_EXCEPTION(
-      hpx::bad_request, "scheduler::init",
-      "Requesting energy objective without having compiled with cpufreq");
-#endif
+void scheduler::initialize_cpu_frequencies() // fallback compiled when ALLSCALE_HAVE_CPUFREQ is not defined
+{
+    cpu_freqs.clear();
+    // VV: Bogus frequency - a single placeholder entry, so cpu_freqs holds exactly one element here
+    cpu_freqs.push_back(1000*1024);
 }
-
+#endif
 
 /**
  *
@@ -667,7 +623,6 @@ void scheduler::optimize_locally(work_item const& work)
 
 #endif
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
         if (uselopt && !lopt_.isConverged()) {
             last_power_usage++;
             allscale::components::monitor *monitor_c = &allscale::monitor::get();
@@ -749,8 +704,7 @@ void scheduler::optimize_locally(work_item const& work)
           }
         }
     #endif
-#endif
-    }
+  }
 }
 
 void scheduler::set_local_optimizer_weights(double time_weight, 
@@ -1258,9 +1212,9 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){
   // ones
 
   size_t nbpus = topo_->get_number_of_pus();
-#ifdef DEBUG_FREQSCALING_
+  #ifdef DEBUG_FREQSCALING_
   std::cout << "nbpus known to topo_:  " << nbpus << "\n" << std::flush;
-#endif
+  #endif
 
   hardware_reconf::make_cpus_online(0, nbpus);
   hardware_reconf::topo_init();
@@ -1281,68 +1235,69 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){
                           "set cpu frequency");
       return;
     }
-#ifdef DEBUG_FREQSCALING_
+  #ifdef DEBUG_FREQSCALING_
     std::cout << "cpu_id " << cpu_id
               << " initial freq policy setting. ret=  " << res << "\n"
               << std::flush;
-#endif
+  #endif
   }
 
-
-  {
-    // set freq of all cores used to min
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
-
-        if (!cpufreq_cpu_exists(pu_num)) {
-          //int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[cpu_freqs[.size()-1]]);
-          int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[frequency_idx]);
-          (void)res;
-#if defined(MEASURE_MANUAL_)
-          fixed_frequency_ = cpu_freqs[frequency_idx];
-#endif
-#ifdef DEBUG_FREQSCALING_
-          //std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[cpu_freqs.size()-1]
-          std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[frequency_idx]
-                    << ", (ret= " << res << ")\n"
-                    << std::flush;
-#endif
-        }
+  // set the frequency of every PU used by the thread pools to cpu_freqs[frequency_idx]
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
+
+      if (!cpufreq_cpu_exists(pu_num)) {
+        //int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[cpu_freqs[.size()-1]]);
+        int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[frequency_idx]);
+        (void)res;
+        #if defined(MEASURE_MANUAL_)
+        fixed_frequency_ = cpu_freqs[frequency_idx];
+        #endif
+        #ifdef DEBUG_FREQSCALING_
+        //std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[cpu_freqs.size()-1]
+        std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[frequency_idx]
+                  << ", (ret= " << res << ")\n"
+                  << std::flush;
+        #endif
       }
     }
   }
 
-  {
-    // check status of Pus frequency
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      unsigned long hardware_freq = 0;
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
 
-        if (!cpufreq_cpu_exists(pu_num)) {
-          do {
-            hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
-#ifdef DEBUG_FREQSCALING_
-            std::cout << "current freq on cpu " << pu_num << " is "
-                      //<< hardware_freq << " (target freq is " << cpu_freqs[cpu_freqs.size()-1]
-                      << hardware_freq << " (target freq is " << cpu_freqs[frequency_idx]
-                      << " )\n"
 
-                      << std::flush;
+  // check status of Pus frequency
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    unsigned long hardware_freq = 0;
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
 
-#endif
+      if (!cpufreq_cpu_exists(pu_num)) {
+        do {
+          hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
+          #ifdef DEBUG_FREQSCALING_
+          std::cout << "current freq on cpu " << pu_num << " is "
+                    //<< hardware_freq << " (target freq is " << cpu_freqs[cpu_freqs.size()-1]
+                    << hardware_freq << " (target freq is " << cpu_freqs[frequency_idx]
+                    << " )\n"
 
-          //} while (hardware_freq != cpu_freqs[cpu_freqs.size()-1]);
-          } while (hardware_freq != cpu_freqs[frequency_idx]);
-        }
+                    << std::flush;
+            #endif
+        //} while (hardware_freq != cpu_freqs[cpu_freqs.size()-1]);
+        } while (hardware_freq != cpu_freqs[frequency_idx]);
       }
     }
   }
+  
+}
+#else
+void scheduler::fix_allcores_frequencies(int frequency_idx) // no-op fallback: frequency_idx is ignored
+{
+    // VV: This is a stub - presumably the variant built without cpufreq support (matching #if is outside this hunk; TODO confirm)
 }
 #endif
 
@@ -1404,44 +1359,7 @@ void scheduler::stop() {
       ++pool_idx;
     }
   }
-
-  /*
-
-  if (energy_requested) {
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-
-    for (int cpu_id = 0; cpu_id < topo.num_logical_cores;
-         cpu_id += topo.num_hw_threads) {
-      bool found_it = false;
-      for (std::size_t i = 0; i != thread_pools_.size(); i++) {
-        if (hpx::threads::test(initial_masks_[i], cpu_id))
-          found_it = true;
-      }
-
-      if (!found_it) {
-#ifdef DEBUG_
-        std::cout << " setting cpu_id " << cpu_id << " back online \n"
-                  << std::flush;
-#endif
-
-        hardware_reconf::make_cpus_online(cpu_id, cpu_id + topo.num_hw_threads);
-      }
-    }
-
-    std::string governor = "ondemand";
-    policy.governor = const_cast<char *>(governor.c_str());
-    std::cout << "Set CPU governors back to " << governor << std::endl;
-    for (int cpu_id = 0; cpu_id < topo.num_logical_cores;
-         cpu_id += topo.num_hw_threads)
-      int res = hardware_reconf::set_freq_policy(cpu_id, policy);
-#endif
-  }
-  */
-
   stopped_ = true;
-  //         work_queue_cv_.notify_all();
-  //         std::cout << "rank(" << rank_ << "): scheduled " << count_ << "\n";
-
 
   /* Output all measured metrics */
 #ifdef DEBUG_MULTIOBJECTIVE_
@@ -1454,14 +1372,13 @@ void scheduler::stop() {
   last_measure_threads = timestamp_now;
 
   update_active_osthreads(active_threads, dt_threads);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  allscale::components::monitor *monitor_c = &allscale::monitor::get();
+    allscale::components::monitor *monitor_c = &allscale::monitor::get();
 
   auto measurement = monitor_c->get_current_power();
   if ( measurement <= 10000 ) {
     update_power_consumption(measurement, dt_power);
   }
-#endif
+  
   if ( meas_active_threads_count == 0 )
     meas_active_threads_count = 1;
   if ( meas_power_count == 0 )
@@ -1469,7 +1386,6 @@ void scheduler::stop() {
   
   std::cout << "\n****************************************************\n" << std::flush;
   std::cout << "Measured Metrics of Application Execution:\n"
-
             << "\tTotal number of tasks scheduled locally (#taskslocal) = "
             << nr_tasks_scheduled << std::endl
 
@@ -1502,5 +1418,6 @@ void scheduler::stop() {
 #endif
 
 }
-}
-}
+
+} // components
+} // allscale
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index d02f98b..d45c8e0 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -60,7 +60,7 @@ namespace allscale { namespace dashboard
 
         state.productive_cycles_per_second = float(state.cur_frequency) * (1.f - state.idle_rate);  // freq to Hz
 
-#if defined(ALLSCALE_HAVE_CPUFREQ) || defined(ALTERNATIVE_SCORE)
+#if defined(ALTERNATIVE_SCORE)
         state.speed = monitor_c->get_avg_time_last_iterations(100);
         state.efficiency = active_cores;
 #else
@@ -71,8 +71,12 @@ namespace allscale { namespace dashboard
 #if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         state.cur_power = monitor_c->get_current_power();
         state.max_power = monitor_c->get_max_power();
-        state.power = state.cur_power / state.max_power;
+#else
+        state.max_power = 1.0;
+        state.cur_power = 1.0;
 #endif
+        state.power = state.cur_power / state.max_power;
+        
         return state;
     }
 }}
@@ -171,7 +175,7 @@ namespace allscale { namespace dashboard
 
     float system_state::score() const
     {
-#if defined(ALLSCALE_HAVE_CPUFREQ) || defined(ALTERNATIVE_SCORE)
+#if defined(ALTERNATIVE_SCORE)
         return std::exp(speed * speed_exponent) *
                 std::exp(efficiency * efficiency_exponent ) *
                 std::exp(power * power_exponent);
@@ -509,7 +513,7 @@ namespace allscale { namespace dashboard
 
                 state.speed = total_speed / client.localities_.size();
 //                 state.speed = std::pow(total_speed, 1.f/client.localities_.size());
-#if defined(ALLSCALE_HAVE_CPUFREQ) || defined(ALTERNATIVE_SCORE)
+#if defined(ALTERNATIVE_SCORE)
                 // VV: This is the number of active threads
                 state.efficiency = total_efficiency;
 #else

From d8346fa83128c3bc215f16c79a4271bceb3364ce Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 20 Nov 2018 14:07:00 +0000
Subject: [PATCH 22/37] Added better scaling functionality to NMD

---
 allscale/components/localoptimizer.hpp   | 23 ++-----
 allscale/components/nmsimplex_bbincr.hpp | 12 +++-
 allscale/optimizer.hpp                   |  8 ++-
 src/components/localoptimizer.cpp        | 46 ++++++++++++-
 src/components/nmsimplex_bbincr.cpp      | 82 ++++++++++++++++++------
 src/optimizer.cpp                        | 26 ++++++--
 6 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index 1a04e9d..55785fa 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -49,23 +49,7 @@ struct actuation
 
 struct localoptimizer
 {
-	localoptimizer()
-		: pending_threads(0.),
-		  pending_energy(0.),
-		  pending_time(0.),
-		  pending_num_times(0.),
-		  mo_initialized(false),
-		  frequency_param_(0),
-		  converged_(false),
-		  convergence_threshold_(0.005),
-		  time_weight(0.0),
-		  energy_weight(0.0),
-		  resource_weight(0.0),
-		  nmd(0.005)
-	{
-		if (optmethod_ == random)
-			srand(std::time(NULL));
-	}
+	localoptimizer();
 	bool isConverged();
 	double evaluate_score(const double objectives[]);
 	void setPolicy(searchPolicy pol)
@@ -98,11 +82,12 @@ struct localoptimizer
 			*resource_weight = this->resource_weight;
 	}
 
+	void set_objectives_scale(const double objectives_scale[3]);
+
 	std::size_t getCurrentThreads() { return threads_param_; }
 
 	void setCurrentThreads(std::size_t threads) { threads_param_ = threads; }
 
-
 	unsigned int getCurrentFrequencyIdx()
 	{
 		return frequency_param_;
@@ -233,6 +218,8 @@ struct localoptimizer
 
 	/* set to true if local optimizer has converged over all objectives */
 	bool converged_;
+
+	double objectives_scale[3];
 };
 } // namespace components
 } // namespace allscale
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 81704c3..8ad4422 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -108,6 +108,8 @@ class NelderMead
 	void print_initial_simplex();
 	void print_iteration();
 
+	void set_scale(const double scale[NMD_NUM_OBJECTIVES]);
+
 	double *getMinVertices()
 	{
 		return v[vs];
@@ -132,9 +134,12 @@ class NelderMead
 	optstepresult step(const double objectives[], 
 			double knob1, double knob2);
 
+	void invalidate_cache();
+	void reevaluate_scores();
+
   private:
 	int warming_up_step;
-
+	bool should_invalidate_cache, should_reevaluate_scores;
 	double max_power_, max_time_;
 
 	// VV: Utility to make sure that we generate new values and not something that already
@@ -148,6 +153,9 @@ class NelderMead
 	//VV: objective_type: { <threads, cpu-freq>: optstepresult }
 	MapCache_t cache_;
 
+	void do_invalidate_cache();
+	void do_reevaluate_scores();
+
 	optstepresult do_step_start();
 	optstepresult do_step_reflect(const double objectives[], 
 			double knob1, double knob2);
@@ -170,7 +178,7 @@ class NelderMead
 
 	bool convergence_reevaluating;
 	int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
-	
+	double scale[NMD_NUM_OBJECTIVES];
 	/* vertex with smallest value */
 	int vs;
 
diff --git a/allscale/optimizer.hpp b/allscale/optimizer.hpp
index 4452bea..b2a1ce8 100644
--- a/allscale/optimizer.hpp
+++ b/allscale/optimizer.hpp
@@ -99,7 +99,11 @@ namespace allscale {
           , threads_min(other.threads_min)
           , threads_max(other.threads_max)
           , previous_num_nodes(other.previous_num_nodes)
-        {}
+        {
+            objectives_scale[0] = other.objectives_scale[0];
+            objectives_scale[1] = other.objectives_scale[1];
+            objectives_scale[2] = other.objectives_scale[2];
+        }
 
         bool active() const
         {
@@ -141,6 +145,8 @@ namespace allscale {
         components::internode_optimizer_t o_ino;
 
         components::NelderMead nmd;
+
+        double objectives_scale[3];
     };
 }
 
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 6676738..ae026a7 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -25,6 +25,32 @@ namespace allscale
 namespace components
 {
 
+localoptimizer::localoptimizer()
+
+		: pending_threads(0.),
+		  pending_energy(0.),
+		  pending_time(0.),
+		  pending_num_times(0.),
+		  mo_initialized(false),
+		  frequency_param_(0),
+		  converged_(false),
+		  convergence_threshold_(0.005),
+		  time_weight(0.0),
+		  energy_weight(0.0),
+		  resource_weight(0.0),
+		  nmd(0.005)
+	{
+		if (optmethod_ == random)
+			srand(std::time(NULL));
+		
+		// VV: Start with 500ms as the guestimation of max iteration time
+		objectives_scale[0] = 0.5;
+		objectives_scale[1] = 1.0;
+		objectives_scale[2] = 1.0;
+
+		nmd.set_scale(objectives_scale);
+	}
+
 double localoptimizer::evaluate_score(const double objectives[])
 {
 	if ( mo_initialized ) {
@@ -120,6 +146,7 @@ void localoptimizer::setmaxthreads(std::size_t threads)
 {
 	max_threads_=threads;
 	threads_param_=threads;
+
 	#if 0
 	double threads_tick = threads / 5.;
 
@@ -157,6 +184,8 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 	#endif
 	const double opt_weights[] = { time_weight, energy_weight, resource_weight };
 
+	nmd.set_scale(objectives_scale);
+
 	if( from_scratch == false ){
 		double prev_simplex[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
 	
@@ -178,16 +207,31 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 	converged_ = false;
 }
 
+void localoptimizer::set_objectives_scale(const double objectives_scale[3]) 
+{
+	for (auto i=0ul; i<NMD_NUM_OBJECTIVES; ++i )
+		this->objectives_scale[i] = objectives_scale[i];
+	
+	nmd.set_scale(objectives_scale);
+}
+
 void localoptimizer::measureObjective(double iter_time, double power, double threads)
 {
+	// VV: iter_time has no bound, threads has bound @max_threads_
+	//     and power 1.0
+
 	std::cout << "Measuring objective: "
 			  << iter_time << " "
 			  << power << " "
 			  << threads << std::endl;
+	if ( objectives_scale[0] < iter_time ) {
+		objectives_scale[0] = iter_time * 2.0;
+		set_objectives_scale(objectives_scale);
+	}
 
 	pending_time += iter_time;
 	pending_energy += power;
-	pending_threads += threads;
+	pending_threads += threads / max_threads_;
 	pending_num_times++;
 }
 
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 7fb76b5..ee49ae1 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -33,8 +33,6 @@ NelderMead::NelderMead(const NelderMead &other)
 {
     EPSILON = other.EPSILON;
     state_ = other.state_;
-    max_power_ = other.max_power_;
-    max_time_ = other.max_time_;
     
     cache_.insert(other.cache_.begin(), other.cache_.end());
     warming_up_step = other.warming_up_step;
@@ -54,8 +52,10 @@ NelderMead::NelderMead(const NelderMead &other)
         vm[i] = other.vm[i];
     }
 
-    for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++i )
+    for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++i ) {
         opt_weights[i] = other.opt_weights[i];
+        scale[i] = other.scale[i];
+    }
 
     for (auto i=0; i<NMD_NUM_KNOBS+1; ++i )
     {
@@ -66,7 +66,6 @@ NelderMead::NelderMead(const NelderMead &other)
     }
 }
 
-
 //NelderMead::NelderMead(double (*objfunc)(double[]),double eps){
 NelderMead::NelderMead(double eps)
 {
@@ -78,11 +77,11 @@ NelderMead::NelderMead(double eps)
     itr = 0;
     state_ = warmup;
     
-    max_power_ = 1.0;
-    max_time_ = 30.0;
-
     warming_up_step = 0;
     convergence_reevaluating = false;
+
+    for (auto i=0ul; i<NMD_NUM_OBJECTIVES; ++i)
+        scale[i] = 1.0;
 }
 
 std::pair<int, NelderMead::direction> NelderMead::explore_next_extra(double *extra, int level, 
@@ -301,19 +300,52 @@ bool NelderMead::cache_update(int threads, int freq_idx,
     return false;
 }
 
+void NelderMead::invalidate_cache()
+{
+    should_invalidate_cache = true;
+}
+
+void NelderMead::reevaluate_scores()
+{
+    should_reevaluate_scores = true;
+}
+
+void NelderMead::do_invalidate_cache()
+{
+    cache_.clear();
+    should_invalidate_cache = false;
+}
+
+void NelderMead::do_reevaluate_scores()
+{
+    auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i )
+    {
+        auto key = std::make_pair( (int)v[i][0], (int)v[i][1] );
+        auto entry = cache_.find(key);
+
+        if ( entry != cache_.end() ) {
+            f[i] = evaluate_score(entry->second.objectives, opt_weights);
+        }
+    }
+
+    should_reevaluate_scores = false;
+}
+
+void NelderMead::set_scale(const double scale[NMD_NUM_OBJECTIVES])
+{
+    for ( auto i=0ul; i<NMD_NUM_OBJECTIVES; ++i )
+        this->scale[i] = scale[i];
+    
+    reevaluate_scores();
+}
+
 double NelderMead::evaluate_score(const double objectives[], const double *weights)
 {
     double score;
     // VV: [time, energy/power, resources]
-    double scale[] = {1.0, 1.0, 1.0};
     
-    // max_time_ = max_time_ > objectives[0] ? max_time_ : objectives[0];
-    // max_power_ = max_power_ > objectives[2] ? max_power_ : objectives[2];
-
-    scale[0] = max_time_;
-    scale[1] = max_power_;
-    scale[2] = (double)constraint_max[0];
-
     if (weights == nullptr)
         weights = opt_weights;
 
@@ -476,8 +508,9 @@ void NelderMead::initialize_simplex(const double weights[3],
 void NelderMead::print_initial_simplex()
 {
     int i, j;
-    std::cout << "[NelderMead DEBUG] Initial Values\n";
-    
+    std::cout << "[NelderMead DEBUG] Initial Values (Order indices:" 
+        << vs << ", " << vh << ", " << vg << ")" << std::endl;
+
     for (j = 0; j < NMD_NUM_KNOBS + 1; j++)
     {
         
@@ -1005,15 +1038,19 @@ optstepresult NelderMead::step(const double objectives[],
     optstepresult res;
     res.threads = 0;
     res.freq_idx = -1;
+
     OUT_DEBUG(
+        auto score = evaluate_score(objectives, nullptr);
+        
         std::cout << "[NelderMead|DEBUG] Starting step with "
             << objectives[0] << " " 
             << objectives[1] << " " 
-            << objectives[2] << std::endl;
+            << objectives[2] << " score " << score << std::endl;
     )
     
     std::size_t tested_combinations = cache_.size();
-
+    
+    #if 0
     evaluate_score(objectives, nullptr);
 
     for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
@@ -1024,7 +1061,14 @@ optstepresult NelderMead::step(const double objectives[],
             f[i] = evaluate_score(entry->second.objectives, nullptr);
         }
     }
+    #endif
 
+    if ( should_invalidate_cache )
+        do_invalidate_cache();
+    
+    if ( should_reevaluate_scores )
+        do_reevaluate_scores();
+    
     switch (state_)
     {
     case warmup:
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 924a9f3..b11ffbb 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -247,6 +247,11 @@ global_optimizer::global_optimizer()
         if ( c_threads_max )
             threads_max = atoi(c_threads_max);
     }
+
+    // VV: Guestimate that max iter time is 500 ms (will be refined over time)
+    objectives_scale[0] = 0.5;
+    objectives_scale[1] = 1.0;
+    objectives_scale[2] = 1.0;
 }
 
 void global_optimizer::signal_objective_changed()
@@ -466,6 +471,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
         .then(
             [this, old_mapping](hpx::future<std::vector<optimizer_state> > future_state) {
                 std::lock_guard<mutex_type> l(mtx_);
+                std::size_t num_active_nodes = std::count(active_nodes_.begin(),                                active_nodes_.end(), true);
                 
                 auto state = future_state.get();
                 float avg_time = 0;
@@ -476,10 +482,15 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                 std::size_t num_avg_time = 0ul;
 
                 for (const auto &s:state) {
+                    // VV: Only keep track of nodes that were selected by last step
+                    if ( from_node++ == previous_num_nodes )
+                        break;
+
                     if ( s.avg_time_ > 0.0) {
                         avg_time += s.avg_time_;
                         num_avg_time ++;
                     }
+
                     avg_energy += s.energy_;
                     avg_threads += s.active_cores_per_node_ / (float) s.cores_per_node_;
 
@@ -491,14 +502,19 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                 else
                     avg_time = 0.0;
 
-                avg_energy /= state.size();
-                avg_threads /= state.size();
+                avg_energy /= num_active_nodes;
+                avg_threads /= num_active_nodes;
 
                 // VV: First record current state
                 double measurements[3] = {avg_time, 
-                                        avg_energy, 
-                                        avg_threads * previous_num_nodes};
+                                          avg_energy, 
+                                          avg_threads};
                 
+                if ( objectives_scale[0] < avg_time ) {
+                    objectives_scale[0] = avg_time * 2.0;
+                    nmd.set_scale(objectives_scale);
+                }
+
                 if ( nmd_initialized == 0 ) {
                     double weights[] = {(double) objective_.speed_exponent, 
                                         (double) objective_.efficiency_exponent,
@@ -507,6 +523,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                                                       (double) threads_min};
                     const double constraint_max[] = {(double) nodes_max, 
                                                     (double) threads_max};
+                    nmd.set_scale(objectives_scale);
 
                     nmd.initialize_simplex(weights, 
                                             nullptr,
@@ -569,7 +586,6 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     */
                     // VV: Some of the nodes might be dead, convert the virtual name
                     //     to the physical name
-                    std::size_t num_active_nodes = std::count(active_nodes_.begin(),                                active_nodes_.end(), true);
                     auto virtual_to_physical = std::vector<std::size_t>();
 
                     std::size_t cur_node = 0ul;

From 392b748be9a19129ccd11a14402794efa469e051 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 20 Nov 2018 14:45:14 +0000
Subject: [PATCH 23/37] Working towards integrating global and local scheduler

---
 allscale/components/nmsimplex_bbincr.hpp |  3 +-
 allscale/components/scheduler.hpp        |  2 +
 allscale/dashboard.hpp                   |  2 +
 allscale/optimizer.hpp                   |  7 +++-
 src/components/localoptimizer.cpp        |  6 ++-
 src/components/nmsimplex_bbincr.cpp      | 48 +++++++++++++++++++++++-
 src/components/scheduler_component.cpp   |  6 ++-
 src/dashboard.cpp                        | 22 +++++++++--
 src/optimizer.cpp                        | 18 ++++++++-
 src/scheduler.cpp                        | 28 ++++++++++++++
 10 files changed, 131 insertions(+), 11 deletions(-)

diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 8ad4422..f630b23 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -100,10 +100,11 @@ class NelderMead
 							const double initial_simplex[][NMD_NUM_KNOBS],
 							const double constraint_min[NMD_NUM_KNOBS],
 							const double constraint_max[NMD_NUM_KNOBS]);
-	
+	/*
 	void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES],
 							const double constraint_min[NMD_NUM_KNOBS],
 							const double constraint_max[NMD_NUM_KNOBS]);
+	*/
 
 	void print_initial_simplex();
 	void print_iteration();
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 90d32e2..1ce0336 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -46,6 +46,8 @@ namespace allscale { namespace components {
             HPX_ASSERT(false);
         }
 
+        bool get_optimization_score();
+
         scheduler(std::uint64_t rank);
         void init();
 
diff --git a/allscale/dashboard.hpp b/allscale/dashboard.hpp
index 385f4f1..eb77398 100644
--- a/allscale/dashboard.hpp
+++ b/allscale/dashboard.hpp
@@ -91,6 +91,8 @@ namespace allscale { namespace dashboard
         float power = 0;
         
         std::string to_json() const;
+        
+        float last_local_score;
 
         template <typename Archive>
         void serialize(Archive& ar, unsigned);
diff --git a/allscale/optimizer.hpp b/allscale/optimizer.hpp
index b2a1ce8..a255497 100644
--- a/allscale/optimizer.hpp
+++ b/allscale/optimizer.hpp
@@ -99,6 +99,8 @@ namespace allscale {
           , threads_min(other.threads_min)
           , threads_max(other.threads_max)
           , previous_num_nodes(other.previous_num_nodes)
+          , use_lopt(other.use_lopt)
+          , last_optimization_score(other.last_optimization_score)
         {
             objectives_scale[0] = other.objectives_scale[0];
             objectives_scale[1] = other.objectives_scale[1];
@@ -110,6 +112,8 @@ namespace allscale {
             return active_;
         }
 
+        double get_optimization_score();
+
         hpx::future<void> balance(bool);
         hpx::future<void> balance_ino(const std::vector<std::size_t> &old_mapping);
         hpx::future<void> balance_ino_nmd(const std::vector<std::size_t> &old_mapping);
@@ -145,8 +149,9 @@ namespace allscale {
         components::internode_optimizer_t o_ino;
 
         components::NelderMead nmd;
-
+        double last_optimization_score;
         double objectives_scale[3];
+        bool use_lopt;
     };
 }
 
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index ae026a7..97f6dbd 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -172,7 +172,11 @@ void localoptimizer::setmaxthreads(std::size_t threads)
 void localoptimizer::initialize_nmd(bool from_scratch)
 {
 	// VV: Place constraints to #threads and cpu_freq tunable knobs
+	int min_threads = 0.25 * max_threads_/((double)threads_dt);
 
+	if ( min_threads < 1 )
+		min_threads = 1;
+	
 	double constraint_min[] = {1, 0};
 	#if defined(ALLSCALE_HAVE_CPUFREQ)
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
@@ -225,7 +229,7 @@ void localoptimizer::measureObjective(double iter_time, double power, double thr
 			  << power << " "
 			  << threads << std::endl;
 	if ( objectives_scale[0] < iter_time ) {
-		objectives_scale[0] = iter_time * 2.0;
+		objectives_scale[0] = iter_time * 1.1;
 		set_objectives_scale(objectives_scale);
 	}
 
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index ee49ae1..be46a41 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -331,6 +331,9 @@ void NelderMead::do_reevaluate_scores()
     }
 
     should_reevaluate_scores = false;
+
+    sort_vertices();
+    centroid();
 }
 
 void NelderMead::set_scale(const double scale[NMD_NUM_OBJECTIVES])
@@ -377,7 +380,7 @@ void NelderMead::set_weights(const double weights[3])
                 << opt_weights[2] << std::endl;
     )
 }
-
+#if 0
 void NelderMead::initialize_simplex(const double weights[3],
                                     const double constraint_min[2],
                                     const double constraint_max[2])
@@ -391,6 +394,14 @@ void NelderMead::initialize_simplex(const double weights[3],
         this->constraint_max[i] = constraint_max[i];
     }
 
+    OUT_DEBUG(
+        std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl;
+        std::cout << constraint_min[0] 
+                    << ":" << constraint_max[0] << std::endl;
+        std::cout << constraint_min[1] 
+                    << ":" << constraint_max[1] << std::endl;
+    )
+
     set_weights(weights);
     state_ = warmup;
     itr = 0;
@@ -424,6 +435,7 @@ void NelderMead::initialize_simplex(const double weights[3],
         )
     }
 }
+#endif
 
 /* FIXME: generalize */
 void NelderMead::initialize_simplex(const double weights[3],
@@ -440,6 +452,14 @@ void NelderMead::initialize_simplex(const double weights[3],
         this->constraint_max[i] = constraint_max[i];
     }
 
+    OUT_DEBUG(
+        std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl;
+        std::cout << constraint_min[0] 
+                    << ":" << constraint_max[0] << std::endl;
+        std::cout << constraint_min[1] 
+                    << ":" << constraint_max[1] << std::endl;
+    )
+    
     set_weights(weights);
     state_ = warmup;
     itr = 0;
@@ -580,6 +600,11 @@ void NelderMead::centroid()
         }
         vm[j] = cent / n;
     }
+
+    OUT_DEBUG (
+        std::cout << "[NelderMead|DEBUG] New Centroid: " 
+        << vm[0] << " " << vm[1] << std::endl;
+    )
 }
 
 void NelderMead::sort_vertices()
@@ -1041,7 +1066,7 @@ optstepresult NelderMead::step(const double objectives[],
 
     OUT_DEBUG(
         auto score = evaluate_score(objectives, nullptr);
-        
+
         std::cout << "[NelderMead|DEBUG] Starting step with "
             << objectives[0] << " " 
             << objectives[1] << " " 
@@ -1253,6 +1278,25 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
     }
 
     if ( ret == true && convergence_reevaluating == true ) {
+        // VV: Now find the best result from cache
+        sort_vertices();
+
+        double best_knobs[NMD_NUM_KNOBS] = { v[vs][0], v[vs][1]};
+        double best_score = f[vs];
+
+        for ( const auto & entry: cache_ ) {
+            auto cur_score = evaluate_score(entry.second.objectives, nullptr);
+            if ( cur_score < best_score) {
+                best_knobs[0] = entry.second.threads;
+                best_knobs[1] = entry.second.freq_idx;
+
+                best_score = cur_score;
+            }
+        }
+
+        v[vs][0] = best_knobs[0];
+        v[vs][1] = best_knobs[1];
+        f[vs] = best_score;
         return true;
     } else if ( ret == true ) {
         // VV: Do another final run to make sure that the objective scores still hold up
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 76ed483..13cc0d7 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -51,7 +51,7 @@ scheduler::scheduler(std::uint64_t rank)
       last_power_usage(0),
       power_sum(0),
       power_count(0),
-      sampling_interval(10),
+      sampling_interval(3),
       current_avg_iter_time(0.0),
       multi_objectives(false),
       time_requested(false),
@@ -661,6 +661,10 @@ void scheduler::optimize_locally(work_item const& work)
                 power_sum=0;
 
                 last_objective_score = lopt_.evaluate_score(last_objectives);
+
+                auto power_dt = t_duration_now - last_measure_power;
+                update_power_consumption(power_sum/last_power_usage, power_dt);
+                last_measure_power = t_duration_now;
             }
 
             elapsedTimeMs = t_duration_now - last_optimization_timestamp_;
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index d45c8e0..ae5320b 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -57,7 +57,7 @@ namespace allscale { namespace dashboard
         state.max_frequency = monitor_c->get_max_freq(0);
 
         std::size_t active_cores = scheduler::get().get_active_threads();
-
+        state.last_local_score = scheduler::get().get_last_objective_score();
         state.productive_cycles_per_second = float(state.cur_frequency) * (1.f - state.idle_rate);  // freq to Hz
 
 #if defined(ALTERNATIVE_SCORE)
@@ -76,7 +76,7 @@ namespace allscale { namespace dashboard
         state.cur_power = 1.0;
 #endif
         state.power = state.cur_power / state.max_power;
-        
+
         return state;
     }
 }}
@@ -110,6 +110,7 @@ namespace allscale { namespace dashboard
         ar & speed;
         ar & efficiency;
         ar & power;
+        ar & last_local_score;
     }
 
     std::string node_state::to_json() const
@@ -225,7 +226,19 @@ namespace allscale { namespace dashboard
 
             const char* host_env = std::getenv(ENVVAR_DASHBOARD_IP);
             const char* port_env = std::getenv(ENVVAR_DASHBOARD_PORT);
+            char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
+            std::string input_objective_str = hpx::get_config_entry("allscale.objective", "");
 
+            if (c_policy && strcasecmp(c_policy, "ino") == 0 )
+                use_gopt = true;
+            else
+                use_gopt = false;
+            
+            if ( input_objective_str == "allscale" )
+                use_lopt = true;
+            else
+                use_lopt = false;
+            
             std::string host;
             if (host_env)
             {
@@ -315,11 +328,11 @@ namespace allscale { namespace dashboard
             buffers[0] = boost::asio::buffer(&m->msg_size, sizeof(std::uint64_t));
             buffers[1] = boost::asio::buffer(m->json.data(), m->json.length());
 
-/*
+            /*
              std::cout << "Sending -----------------------------------\n";
              std::cout << m->json << '\n';
              std::cout << "Sending done ------------------------------\n";
-*/
+            */
             boost::asio::async_write(socket_, buffers,
                 [f = std::move(f), m](boost::system::error_code ec, std::size_t /*length*/)
                 {
@@ -448,6 +461,7 @@ namespace allscale { namespace dashboard
         std::vector<hpx::id_type> localities_;
         std::uint64_t time = 0;
         bool enabled_;
+        double use_gopt, use_lopt;
     };
 
     dashboard_client& dashboard_client::get()
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index b11ffbb..1733351 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -200,9 +200,17 @@ global_optimizer::global_optimizer()
     f_resource_max(-1.0f), f_resource_leeway(-1.0f), 
     nmd(0.005),
     nmd_initialized(0),
-    nodes_min(1), nodes_max(localities_.size()), threads_min(0), threads_max(0)
+    nodes_min(1), nodes_max(localities_.size()), threads_min(0), threads_max(0),
+    last_optimization_score(1.0)
 {
     char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
+    std::string input_objective_str =
+      hpx::get_config_entry("allscale.objective", "");
+    
+    if ( input_objective_str == "allscale" )
+        use_lopt = true;
+    else
+        use_lopt = false;
     previous_num_nodes = localities_.size();
 
     if (c_policy && strcasecmp(c_policy, "ino") == 0 )
@@ -254,6 +262,11 @@ global_optimizer::global_optimizer()
     objectives_scale[2] = 1.0;
 }
 
+double global_optimizer::get_optimization_score()
+{
+    return last_optimization_score;
+}
+
 void global_optimizer::signal_objective_changed()
 {
     const double new_weights[3] = {
@@ -536,6 +549,9 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                 auto action = nmd.step(measurements, 
                                         previous_num_nodes,
                                         avg_threads * previous_num_nodes);
+                
+                last_optimization_score = nmd.evaluate_score(measurements, nullptr);
+
                 // VV: Todo do something with the action
                 //     assume that .threads = nodes and .freq_idx = threads per node
                 int new_num_nodes = action.threads;
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index ab19eff..fcb70c4 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -375,6 +375,17 @@ namespace allscale
             }
         }
 
+        double get_local_objective() {
+            auto &&local_scheduler = scheduler::get();
+            return local_scheduler.get_last_objective_score();
+        }
+
+        double get_last_objective_score()
+        {
+            auto &&local_scheduler = scheduler::get();
+            return local_scheduler.get_last_objective_score();    
+        }
+
         void set_speed_exponent(float exp)
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
@@ -736,6 +747,23 @@ namespace allscale
         );
     }
 
+    double get_last_objective_score()
+    {
+        std::vector<double> scores;
+
+        runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(
+            [&](scheduler_service& sched)
+            {
+                scores.push_back(sched.get_last_objective_score());
+            }
+        );
+
+        std::cout << "GET_LAST_OBJETIVE_SCORE (SCHED): got " << scores.size() << " values" << std::endl;
+        for (const auto &score: scores ) {
+            std::cout << score  << std::endl;
+        }
+    }
+
     void set_efficiency_exponent_broadcast(float exp)
     {
         runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(

From 25ba362779c7a92b11ee7905b7b5ad1cf50d0338 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 20 Nov 2018 17:33:24 +0000
Subject: [PATCH 24/37] Finalizing integration

---
 allscale/components/localoptimizer.hpp   |  2 +-
 allscale/components/nmsimplex_bbincr.hpp |  7 +++
 allscale/components/scheduler.hpp        |  2 +
 allscale/scheduler.hpp                   |  1 +
 src/components/localoptimizer.cpp        | 63 +++++++++++++++++++++---
 src/components/nmsimplex_bbincr.cpp      | 31 +++++++++---
 src/components/scheduler_component.cpp   | 28 +++++++++++
 src/dashboard.cpp                        | 12 -----
 src/optimizer.cpp                        |  7 ++-
 src/scheduler.cpp                        | 34 ++++++++++++-
 10 files changed, 158 insertions(+), 29 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index 55785fa..c5e6afc 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -64,7 +64,7 @@ struct localoptimizer
 	}
 	void initialize_nmd(bool from_scratch);
 	searchPolicy getPolicy() { return optmethod_; }
-
+	
 	// VV: Modifying the objectives triggers restarting the optimizer
 	void setobjectives(double time_weight, 
 						double energy_weight, 
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index f630b23..974a0c1 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -138,6 +138,9 @@ class NelderMead
 	void invalidate_cache();
 	void reevaluate_scores();
 
+	void update_constraints(const double constraint_min[NMD_NUM_KNOBS],
+							const double constraint_max[NMD_NUM_KNOBS]);
+
   private:
 	int warming_up_step;
 	bool should_invalidate_cache, should_reevaluate_scores;
@@ -241,6 +244,10 @@ class NelderMead
 	double constraint_max[2];
 
 	double opt_weights[NMD_NUM_OBJECTIVES];
+
+	double next_constraint_min[NMD_NUM_KNOBS],
+			next_constraint_max[NMD_NUM_KNOBS];
+	bool should_update_constraints = false;
 };
 
 } // namespace components
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 1ce0336..9eb9fbf 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -79,6 +79,8 @@ namespace allscale { namespace components {
                                          double *energy_weight,
                                          double *resource_weight);
         
+        void update_max_threads(std::size_t max_threads);
+
         double get_last_objective_score() {
                 return last_objective_score;
         }
diff --git a/allscale/scheduler.hpp b/allscale/scheduler.hpp
index 8cf6006..f448ad5 100644
--- a/allscale/scheduler.hpp
+++ b/allscale/scheduler.hpp
@@ -48,6 +48,7 @@ namespace allscale
 
         static HPX_EXPORT void update_policy(task_times const& times, std::vector<bool> mask, std::uint64_t frequency);
         static void apply_new_mapping(const std::vector<std::size_t> &new_mapping);
+        static void update_max_threads(std::size_t max_threads);
 
         static HPX_EXPORT void schedule(work_item&& work);
         static HPX_EXPORT components::scheduler* run(std::size_t rank);
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 97f6dbd..1a313b1 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -88,7 +88,7 @@ void localoptimizer::printobjectives()
 {
 	std::cout << "[LocalOptimizer|DEBUG] Weights=[time:" << time_weight
 			  << ", energy:" << energy_weight
-			  << ", resource:" << resource_weight << "]" << std::endl;
+			  << ", resource:" << resource_weight << "]" << std::endl << std::flush;
 }
 #endif
 
@@ -166,25 +166,73 @@ void localoptimizer::setmaxthreads(std::size_t threads)
 	#else 
 		threads_dt = 1.;
 	#endif
-}
+	
+	if ( mo_initialized ) {
+		if ( converged_ == false ) {
+			initialize_nmd(true);
+		} else {
+			double factor;
+			int min_freq = 0;
+			int max_freq = frequencies_param_allowed_.size() - 1;
+
+			if ( time_weight >= energy_weight + resource_weight) {
+				factor = 0.5;
+				min_freq = frequencies_param_allowed_.size() / 4;
+			}		
+			else {
+				factor = 0.25;
+				max_freq = max_freq / 2;
+			}
+
+			int min_threads = factor * max_threads_/((double)threads_dt);
 
+			if ( min_threads < 1 )
+				min_threads = 1;
+			
+			double constraint_min[] = {min_threads, min_freq};
+			#if defined(ALLSCALE_HAVE_CPUFREQ)
+			double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+									(double)max_freq};
+			#else 
+			std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl;
+			double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+									0.0};
+			#endif
+
+			nmd.update_constraints(constraint_min, constraint_max);
+		}
+	}
+}
 
 void localoptimizer::initialize_nmd(bool from_scratch)
 {
 	// VV: Place constraints to #threads and cpu_freq tunable knobs
-	int min_threads = 0.25 * max_threads_/((double)threads_dt);
+	double factor;
+	int min_freq = 0;
+	int max_freq = frequencies_param_allowed_.size() - 1;
+
+	if ( time_weight >= energy_weight + resource_weight) {
+		factor = 0.5;
+		min_freq = frequencies_param_allowed_.size() / 4;
+	}		
+	else {
+		factor = 0.25;
+		max_freq = max_freq / 2;
+	}
+
+	int min_threads = factor * max_threads_/((double)threads_dt);
 
 	if ( min_threads < 1 )
 		min_threads = 1;
 	
-	double constraint_min[] = {1, 0};
+	double constraint_min[] = {min_threads, min_freq};
 	#if defined(ALLSCALE_HAVE_CPUFREQ)
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
-							   (double)frequencies_param_allowed_.size() - 1};
+							(double)max_freq};
 	#else 
 	std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl;
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
-						       0.0};
+							0.0};
 	#endif
 	const double opt_weights[] = { time_weight, energy_weight, resource_weight };
 
@@ -274,7 +322,7 @@ actuation localoptimizer::step(std::size_t active_threads)
 											pending_threads};
 		reset_accumulated_measurements();
 
-		if ( explore_knob_domain ){
+		if ( converged_ == false ){
 			optstepresult nmd_res = nmd.step(latest_measurements,
 											 active_threads,
 											 frequency_param_);
@@ -300,7 +348,6 @@ actuation localoptimizer::step(std::size_t active_threads)
 				act.frequency_idx = minimization_point[1];
 				
 				// VV: Stop searching for new knob_set
-				explore_knob_domain = false;
 				converged_ = true;
 			} else {
 				// VV: Have not converged yet, keep exploring
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index be46a41..db01d43 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -64,6 +64,8 @@ NelderMead::NelderMead(const NelderMead &other)
             initial_configurations[i][j] = other.initial_configurations[i][j];
         }
     }
+
+    should_update_constraints = true;
 }
 
 //NelderMead::NelderMead(double (*objfunc)(double[]),double eps){
@@ -437,6 +439,17 @@ void NelderMead::initialize_simplex(const double weights[3],
 }
 #endif
 
+void NelderMead::update_constraints(const double constraint_min[NMD_NUM_KNOBS],
+							    const double constraint_max[NMD_NUM_KNOBS])
+{
+    for (auto i=0; i<NMD_NUM_KNOBS; ++i) {
+        next_constraint_min[i] = constraint_min[i];
+        next_constraint_max[i] = constraint_max[i];
+    }
+
+    should_update_constraints = true;
+}
+
 /* FIXME: generalize */
 void NelderMead::initialize_simplex(const double weights[3],
                                     const double initial_simplex[][NMD_NUM_KNOBS],
@@ -446,11 +459,7 @@ void NelderMead::initialize_simplex(const double weights[3],
     int i, j;
     long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
-    for (i = 0; i < NMD_NUM_KNOBS; i++)
-    {
-        this->constraint_min[i] = constraint_min[i];
-        this->constraint_max[i] = constraint_max[i];
-    }
+    update_constraints(constraint_min, constraint_max);
 
     OUT_DEBUG(
         std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl;
@@ -459,7 +468,7 @@ void NelderMead::initialize_simplex(const double weights[3],
         std::cout << constraint_min[1] 
                     << ":" << constraint_max[1] << std::endl;
     )
-    
+
     set_weights(weights);
     state_ = warmup;
     itr = 0;
@@ -554,6 +563,7 @@ void NelderMead::print_initial_simplex()
                      << e->second.objectives[2] << " "
                      << std::endl;
         }
+        std::cout << std::flush;
     }
 }
 
@@ -1073,6 +1083,15 @@ optstepresult NelderMead::step(const double objectives[],
             << objectives[2] << " score " << score << std::endl;
     )
     
+    if ( should_update_constraints ) {
+        for (i=0; i<NMD_NUM_KNOBS; ++i )
+        {
+           constraint_min[i] = next_constraint_min[i];
+           constraint_max[i] = next_constraint_max[i];
+        }
+        should_update_constraints = false;
+    }
+
     std::size_t tested_combinations = cache_.size();
     
     #if 0
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 13cc0d7..44ec7fa 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -220,6 +220,7 @@ void scheduler::init() {
      allscale policy is the default */
     std::string input_optpolicy_str =
       hpx::get_config_entry("allscale.policy", "none");
+    uselopt=false;
 #ifdef DEBUG_MULTIOBJECTIVE_
     std::cout << "[Local Optimizer|INFO] Optimization Policy Active = " << input_optpolicy_str << std::endl;
 #endif
@@ -229,10 +230,19 @@ void scheduler::init() {
       lopt_.setPolicy(random);
   else if (input_optpolicy_str=="manual")
       lopt_.setPolicy(manual);
+  else if ( input_optpolicy_str == "none") {
+      char *c_optpolicy = std::getenv("ALLSCALE_LOCAL_OPTIMIZER");
+      if ( c_optpolicy && strcmp(c_optpolicy, "allscale") == 0 ) {
+          lopt_.setPolicy(allscale);
+          uselopt=true;
+      }
+  }
 	else if ( input_optpolicy_str != "none" ) {
 		HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", 
 							"unknown allscale.policy");
 	}
+
+
 #ifdef MEASURE_MANUAL_
   std::string input_osthreads_str =
       hpx::get_config_entry("allscale.osthreads", "");
@@ -254,6 +264,12 @@ void scheduler::init() {
   }
 #endif
 
+  if (input_objective_str.empty() ){
+    char *c_opt_objective = std::getenv("ALLSCALE_LOCAL_OBJECTIVE");
+    if ( c_opt_objective )
+      input_objective_str = std::string(c_opt_objective);
+  }
+
   if (!input_objective_str.empty()) {
     uselopt=true;
     std::istringstream iss_leeways(input_objective_str);
@@ -711,6 +727,18 @@ void scheduler::optimize_locally(work_item const& work)
   }
 }
 
+
+void scheduler::update_max_threads(std::size_t max_threads)
+{
+  std::cout << "Will try to set max threads to " << max_threads <<std::endl;
+  if (uselopt)
+    lopt_.setmaxthreads(max_threads);
+  else if (active_threads > max_threads )
+    suspend_threads(active_threads - max_threads);
+  else if ( active_threads < max_threads )
+    resume_threads(max_threads - active_threads);
+}
+
 void scheduler::set_local_optimizer_weights(double time_weight, 
                                          double energy_weight,
                                          double resource_weight)
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index ae5320b..52a6890 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -226,18 +226,6 @@ namespace allscale { namespace dashboard
 
             const char* host_env = std::getenv(ENVVAR_DASHBOARD_IP);
             const char* port_env = std::getenv(ENVVAR_DASHBOARD_PORT);
-            char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
-            std::string input_objective_str = hpx::get_config_entry("allscale.objective", "");
-
-            if (c_policy && strcasecmp(c_policy, "ino") == 0 )
-                use_gopt = true;
-            else
-                use_gopt = false;
-            
-            if ( input_objective_str == "allscale" )
-                use_lopt = true;
-            else
-                use_lopt = false;
             
             std::string host;
             if (host_env)
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 1733351..48312c2 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -100,11 +100,15 @@ namespace allscale
         scheduler::apply_new_mapping(new_mapping);
     }
 
+    void optimizer_update_max_threads(std::size_t max_threads) {
+        scheduler::update_max_threads(max_threads);
+    }
 } // namespace allscale
 
 HPX_PLAIN_DIRECT_ACTION(allscale::get_optimizer_state, allscale_get_optimizer_state_action);
 HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_policy, allscale_optimizer_update_policy_action);
 HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_policy_ino, allscale_optimizer_update_policy_action_ino);
+HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_max_threads, allscale_optimizer_update_max_threads);
 
 namespace allscale
 {
@@ -757,7 +761,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                         )
                     }
 
-                    if (previous_num_nodes != new_num_nodes ){
+                    if (previous_num_nodes != new_num_nodes ) {
                         OUT_DEBUG(
                             std::cout << "[GLOBAL OPTIMIZER] Rebalancing (NEW):" << std::endl;
 
@@ -775,6 +779,7 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                         previous_num_nodes = new_num_nodes;
                         hpx::lcos::broadcast_apply<allscale_optimizer_update_policy_action_ino>(localities_, new_mapping);
                     }
+                    hpx::lcos::broadcast_apply<allscale_optimizer_update_max_threads>(localities_, new_threads_per_node);
                 }
             });
 }
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index fcb70c4..f1be5bc 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -304,6 +304,8 @@ namespace allscale
           , right_id_(std::move(other.right_id_))
           , is_root_(other.is_root_)
           , optimizer_(std::move(other.optimizer_))
+          , use_gopt(other.use_gopt)
+          , use_lopt(other.use_lopt)
         {
             HPX_ASSERT(false);
         }
@@ -316,6 +318,20 @@ namespace allscale
           , parent_(here_.getParent())
           , is_root_(here_ == root_)
         {
+            char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
+            std::string input_objective_str = hpx::get_config_entry("allscale.objective", "");
+
+            if (c_policy && strcasecmp(c_policy, "ino") == 0 )
+                use_gopt = true;
+            else
+                use_gopt = false;
+            
+            if ( input_objective_str == "allscale" )
+                use_lopt = true;
+            else
+                use_lopt = false;
+
+
             if (parent_.getRank() != scheduler::rank())
             {
                 parent_id_ = hpx::naming::get_id_from_locality_id(
@@ -343,7 +359,7 @@ namespace allscale
 
             if (is_root_) run();
         }
-
+        
         std::string policy()
         {
             return policy_.policy();
@@ -356,6 +372,11 @@ namespace allscale
                 tree_scheduling_policy::from_mapping(*policy_.policy_, new_mapping);
         }
 
+        void update_max_threads(std::size_t max_threads) {
+            auto &&local_scheduler = scheduler::get();
+            local_scheduler.update_max_threads(max_threads);
+        }
+
         void toggle_node(std::size_t locality_id)
         {
             {
@@ -455,6 +476,7 @@ namespace allscale
             );
         }
 
+        bool use_gopt, use_lopt;
 
         void set_policy(std::string policy)
         {
@@ -844,6 +866,16 @@ namespace allscale
         monitor::get().set_cur_freq(freq);
     }
 
+    void scheduler::update_max_threads(std::size_t max_threads)
+    {
+        runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(
+            [&](scheduler_service& sched)
+            {
+                sched.update_max_threads(max_threads);
+            }
+        );
+    }
+
     void scheduler::apply_new_mapping(const std::vector<std::size_t> &new_mapping)
     {
         runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(

From 76afbda2ecf46adc1054c955ee68e2212084fc68 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 20 Nov 2018 18:27:51 +0000
Subject: [PATCH 25/37] Put an upper limit to how many times NMD is allowed to
 recurse due to measurement information that's cached

---
 allscale/components/nmsimplex_bbincr.hpp |  2 +
 src/components/nmsimplex_bbincr.cpp      | 52 ++++++++++++++++--------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 974a0c1..11e5d09 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -248,6 +248,8 @@ class NelderMead
 	double next_constraint_min[NMD_NUM_KNOBS],
 			next_constraint_max[NMD_NUM_KNOBS];
 	bool should_update_constraints = false;
+
+	int times_used_cached;
 };
 
 } // namespace components
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index db01d43..4fe6de5 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -66,6 +66,7 @@ NelderMead::NelderMead(const NelderMead &other)
     }
 
     should_update_constraints = true;
+    times_used_cached = 0;
 }
 
 //NelderMead::NelderMead(double (*objfunc)(double[]),double eps){
@@ -551,13 +552,13 @@ void NelderMead::print_initial_simplex()
         const int freq_idx = (int) v[j][1];
 
         auto e = cache_.find(std::make_pair(threads, freq_idx));
-        std::cout << " Objective value = " << f[j];
+        std::cout << " Objective value = "<< std::flush << f[j] << std::flush;
 
         if ( e == cache_.end() )
         {
-            std::cout << " (not in cache)" << std::endl;
+            std::cout << " (not in cache)" << std::flush << std::endl;
         } else {
-            std::cout << " OBJs: "
+            std::cout << " OBJs: " << std::flush
                      << e->second.objectives[0] << " "
                      << e->second.objectives[1] << " "
                      << e->second.objectives[2] << " "
@@ -598,10 +599,10 @@ void NelderMead::centroid()
     int j, m;
     double cent;
 
-    for (j = 0; j <= n - 1; j++)
+    for (j = 0; j < NMD_NUM_KNOBS; j++)
     {
         cent = 0.0;
-        for (m = 0; m <= n; m++)
+        for (m = 0; m < NMD_NUM_KNOBS +1; m++)
         {
             if (m != vg)
             {
@@ -611,6 +612,8 @@ void NelderMead::centroid()
         vm[j] = cent / n;
     }
 
+    my_constraints(vm);
+
     OUT_DEBUG (
         std::cout << "[NelderMead|DEBUG] New Centroid: " 
         << vm[0] << " " << vm[1] << std::endl;
@@ -621,7 +624,7 @@ void NelderMead::sort_vertices()
 {
     // VV: -1 is used for padding because the index to this map will never evaluate to 0
     int map_to_index[] = {
-        -1, 0, 1, 0, 2, 0, 0, 0};
+        0, 0, 1, 0, 2, 0, 0, 0};
 
     vg = vs = vh = 0;
 
@@ -650,6 +653,7 @@ void NelderMead::sort_vertices()
 optstepresult NelderMead::do_step_start()
 {
     optstepresult res;
+    times_used_cached ++;
 
     OUT_DEBUG(
         std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
@@ -687,7 +691,8 @@ optstepresult NelderMead::do_step_start()
 
     auto entry = cache_.find(key);
 
-    if (entry != cache_.end())
+    //VV: Fixme, remove recursion due to cache
+    if (entry != cache_.end() && times_used_cached < 15)
     {
         auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
         auto dt = timestamp_now - entry->second._cache_timestamp;
@@ -1073,6 +1078,7 @@ optstepresult NelderMead::step(const double objectives[],
     optstepresult res;
     res.threads = 0;
     res.freq_idx = -1;
+    times_used_cached = 0;
 
     OUT_DEBUG(
         auto score = evaluate_score(objectives, nullptr);
@@ -1171,7 +1177,6 @@ optstepresult NelderMead::step(const double objectives[],
             std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl;
         }
 
-        optstepresult res;
         res.objectives[0] = -1;
         res.objectives[1] = -1;
         res.objectives[2] = -1;
@@ -1184,7 +1189,7 @@ optstepresult NelderMead::step(const double objectives[],
         v[warming_up_step][1] = res.freq_idx;
         warming_up_step++;
 
-        return res;
+        break;
     }
     break;
     case start:
@@ -1209,16 +1214,29 @@ optstepresult NelderMead::step(const double objectives[],
         return res;
     }
 
-    res.converged = testConvergence(tested_combinations);
-
-    if (res.converged == true)
+    if ( state_ != warmup )
     {
-        res.threads = v[vs][0];
-        res.freq_idx = v[vs][1];
-        OUT_DEBUG(
-            std::cout << "[NelderMead|DEBUG] Converged to " << res.threads << " " << res.freq_idx << std::endl;
-        )
+        res.converged = testConvergence(tested_combinations);
+
+        if (res.converged == true)
+        {
+            res.threads = v[vs][0];
+            res.freq_idx = v[vs][1];
+            OUT_DEBUG(
+                std::cout << "[NelderMead|DEBUG] Converged to " << res.threads << " " << res.freq_idx << std::endl;
+            )
+        }
     }
+    
+    if ( res.threads > constraint_max[0])
+        res.threads = (int) constraint_max[0];
+    else if ( res.threads < constraint_min[0])
+        res.threads = (int) constraint_min[0];
+
+    if ( res.freq_idx > constraint_max[1])
+        res.freq_idx = (int) constraint_max[1];
+    else if ( res.freq_idx < constraint_min[1])
+        res.freq_idx = (int) constraint_min[1];
 
     std::cout << "Stop step with "
                 << objectives[0] << " " 

From 546a6cfbe9b1d8d7934edbfae168f15f2de36149 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Wed, 21 Nov 2018 09:04:12 +0000
Subject: [PATCH 26/37] When re-evaluating scores re-pick the top
 NMD_NUM_KNOBS+1 configurations for the NelderMead algorithm

---
 src/components/nmsimplex_bbincr.cpp | 50 +++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 4fe6de5..6f2126a 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -9,7 +9,8 @@
  * function with complex analytical evaluation)
  *
  */
-
+#include <vector>
+#include <algorithm>
 #include <allscale/components/nmsimplex_bbincr.hpp>
 #include <cmath>
 
@@ -323,19 +324,48 @@ void NelderMead::do_reevaluate_scores()
 {
     auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
-    for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i )
-    {
-        auto key = std::make_pair( (int)v[i][0], (int)v[i][1] );
-        auto entry = cache_.find(key);
+    std::vector<optstepresult> fresh;
+    should_reevaluate_scores = false;
 
-        if ( entry != cache_.end() ) {
-            f[i] = evaluate_score(entry->second.objectives, opt_weights);
-        }
+    for ( const auto &entry: cache_ ) {
+        auto dt = timestamp_now - entry.second._cache_timestamp;
+        if ( dt <= entry.second._cache_expires_dt )
+            fresh.push_back(entry.second);
     }
 
-    should_reevaluate_scores = false;
+    if ( fresh.size() >= NMD_NUM_KNOBS +1 ) {
+        std::sort(fresh.begin(), fresh.end(), 
+            [this](const optstepresult &l, const optstepresult &r) mutable -> int {
+                return evaluate_score(l.objectives, nullptr) < 
+                        evaluate_score(r.objectives, nullptr);
+            });
+        
+        for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i ) {
+            v[i][0] = fresh[i].threads;
+            v[i][1] = fresh[i].freq_idx;
+        }
 
-    sort_vertices();
+        vs = 0;
+        vh = 1;
+        vg = 2;
+    }
+    else
+    {
+        for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i )
+        {
+            auto key = std::make_pair( (int)v[i][0], (int)v[i][1] );
+            auto entry = cache_.find(key);
+
+            if ( entry != cache_.end() ) {
+                f[i] = evaluate_score(entry->second.objectives, opt_weights);
+            }
+        }
+        sort_vertices();
+    }
+    OUT_DEBUG(
+            std::cout << "[NelderMead|DEBUG] Re-Evaluated all scores" << std::endl;
+            print_initial_simplex();
+        )
     centroid();
 }
 

From 56f3529dbcfe487a73224ed65e6848bd28b603d7 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 22 Nov 2018 10:33:25 +0000
Subject: [PATCH 27/37] Bugfix

Faking a list of allowed cpufrequencies even when CPUFREQ is not present
---
 src/components/scheduler_component.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 44ec7fa..2b865bf 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -220,6 +220,13 @@ void scheduler::init() {
      allscale policy is the default */
     std::string input_optpolicy_str =
       hpx::get_config_entry("allscale.policy", "none");
+      if ( input_optpolicy_str == "none" ){
+        char *c_optpolicy = std::getenv("ALLSCALE_LOCAL_OPTIMIZER");
+        if ( c_optpolicy) 
+          input_optpolicy_str = std::string(c_optpolicy);
+      }
+
+
     uselopt=false;
 #ifdef DEBUG_MULTIOBJECTIVE_
     std::cout << "[Local Optimizer|INFO] Optimization Policy Active = " << input_optpolicy_str << std::endl;
@@ -230,14 +237,7 @@ void scheduler::init() {
       lopt_.setPolicy(random);
   else if (input_optpolicy_str=="manual")
       lopt_.setPolicy(manual);
-  else if ( input_optpolicy_str == "none") {
-      char *c_optpolicy = std::getenv("ALLSCALE_LOCAL_OPTIMIZER");
-      if ( c_optpolicy && strcmp(c_optpolicy, "allscale") == 0 ) {
-          lopt_.setPolicy(allscale);
-          uselopt=true;
-      }
-  }
-	else if ( input_optpolicy_str != "none" ) {
+ 	else if ( input_optpolicy_str != "none" ) {
 		HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", 
 							"unknown allscale.policy");
 	}
@@ -317,6 +317,7 @@ void scheduler::init() {
                   << std::flush;
 #endif
       } else {
+        std::cout << "TRIED PARSING \"" << obj << "\"" << std::endl;
         HPX_THROW_EXCEPTION(
             hpx::bad_request, "scheduler::init",
             boost::str(
@@ -415,6 +416,7 @@ void scheduler::init() {
     #else
     // VV: Max number of threads, and an arbitrary frequency index
     lopt_.reset(os_thread_count,0);
+    auto freq_temp = lopt_.setfrequencies({0});
     #endif
     
     // VV: Set objectives after setting all constraints to

From 6702140d7adb5fe752599710bfaacf0edb3dccf9 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 22 Nov 2018 13:56:35 +0000
Subject: [PATCH 28/37] Bugfix

Setting initial configuration was not taking into account that the
constraints will change in a future step
---
 allscale/components/localoptimizer.hpp |  5 +++--
 src/components/localoptimizer.cpp      | 29 +++++++++++++++++++++-----
 src/components/nmsimplex_bbincr.cpp    | 26 +++++++++++++++--------
 src/optimizer.cpp                      |  4 +++-
 4 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index c5e6afc..96d1f5f 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -35,7 +35,8 @@ enum searchPolicy
 {
 	allscale,
 	random,
-	manual
+	manual,
+	none
 };
 
 
@@ -187,7 +188,7 @@ struct localoptimizer
 	NelderMead nmd;
 
 	/* single objective optimization method used */
-	searchPolicy optmethod_ = random;
+	searchPolicy optmethod_ = none;
 
 	/* active optimization parameter - nr of OS threads active */
 	int threads_param_;
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 1a313b1..7ea0f67 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -224,7 +224,8 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 
 	if ( min_threads < 1 )
 		min_threads = 1;
-	
+	int max_threads = max_threads_;
+
 	double constraint_min[] = {min_threads, min_freq};
 	#if defined(ALLSCALE_HAVE_CPUFREQ)
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
@@ -248,10 +249,28 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 								constraint_min, 
 								constraint_max);
 	} else {
-		nmd.initialize_simplex(opt_weights,
-								nullptr,
-								constraint_min, 
-								constraint_max);
+		if ( time_weight >= energy_weight + resource_weight ) {
+			double initial_simplex[3][2] = {
+				{min_threads, constraint_min[1]},
+				{max_threads, constraint_max[1]},
+				{(min_threads+max_threads)/2., constraint_max[1]}
+			};
+			nmd.initialize_simplex(opt_weights,
+									initial_simplex,
+									constraint_min, 
+									constraint_max);
+		} else {
+			double initial_simplex[3][2] = {
+				{min_threads, constraint_min[1]},
+				{max_threads, constraint_min[1]},
+				{(min_threads+max_threads)/2., constraint_max[1]}
+			};
+
+			nmd.initialize_simplex(opt_weights,
+									initial_simplex,
+									constraint_min, 
+									constraint_max);
+		}
 	}
 
 	mo_initialized = true;
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 6f2126a..850f439 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -227,11 +227,13 @@ void NelderMead::generate_new(F &gen)
             dir = logistics.second;
 
             #endif
+            /*
             OUT_DEBUG(
                 std::cout << "[NelderMead|Debug] Rejecting " 
                     << new_set[0] << " " << new_set[1] 
                     << " will try offset " << extra[0] << " " << extra[1] <<  std::endl;
             )
+            */
         } else {
             break;
         }
@@ -395,7 +397,7 @@ double NelderMead::evaluate_score(const double objectives[], const double *weigh
     #else 
     score = 1.0;
     for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++ i) {
-        score *= std::exp(weights[i]*objectives[i]/scale[i]);
+        score *= std::exp((objectives[i]/scale[i]) * weights[i]);
     }
     #endif
     return score;
@@ -490,7 +492,11 @@ void NelderMead::initialize_simplex(const double weights[3],
     int i, j;
     long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
-    update_constraints(constraint_min, constraint_max);
+    for (i=0; i<NMD_NUM_KNOBS; ++i )
+    {
+        this->constraint_min[i] = constraint_min[i];
+        this->constraint_max[i] = constraint_max[i];
+    }
 
     OUT_DEBUG(
         std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl;
@@ -541,13 +547,6 @@ void NelderMead::initialize_simplex(const double weights[3],
                 }
 
             } while (is_ok == 0);
-
-            OUT_DEBUG(
-                std::cout << "[NelderMead|DEBUG] Random initial simplex [" << i << "]: ";
-                for ( j =0; j<NMD_NUM_KNOBS; ++j) 
-                    std::cout << initial_configurations[i][j] << " ";
-                std::cout << std::endl;
-            )
         }
         #endif
     } else {
@@ -562,6 +561,15 @@ void NelderMead::initialize_simplex(const double weights[3],
             }
         }
     }
+
+    OUT_DEBUG(
+        for (auto i=0; i<NMD_NUM_KNOBS+1; ++i) {
+            std::cout << "[NelderMead|DEBUG] (initialize) initial simplex [" << i << "]: ";
+            for ( j =0; j<NMD_NUM_KNOBS; ++j) 
+                std::cout << initial_configurations[i][j] << " ";
+            std::cout << std::endl;
+        }
+    )
 }
 
 /* print out the initial values */
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 48312c2..a67428f 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -779,7 +779,9 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                         previous_num_nodes = new_num_nodes;
                         hpx::lcos::broadcast_apply<allscale_optimizer_update_policy_action_ino>(localities_, new_mapping);
                     }
-                    hpx::lcos::broadcast_apply<allscale_optimizer_update_max_threads>(localities_, new_threads_per_node);
+
+                    if ( threads_min != threads_max )
+                        hpx::lcos::broadcast_apply<allscale_optimizer_update_max_threads>(localities_, new_threads_per_node);
                 }
             });
 }

From 0d8e1f8b2c993563e36d66ee5ae006c1b3771254 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 22 Nov 2018 14:19:50 +0000
Subject: [PATCH 29/37] Modified initial simplex

---
 src/components/localoptimizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 7ea0f67..d002112 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -252,7 +252,7 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 		if ( time_weight >= energy_weight + resource_weight ) {
 			double initial_simplex[3][2] = {
 				{min_threads, constraint_min[1]},
-				{max_threads, constraint_max[1]},
+				{max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0},
 				{(min_threads+max_threads)/2., constraint_max[1]}
 			};
 			nmd.initialize_simplex(opt_weights,
@@ -262,7 +262,7 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 		} else {
 			double initial_simplex[3][2] = {
 				{min_threads, constraint_min[1]},
-				{max_threads, constraint_min[1]},
+				{max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0},
 				{(min_threads+max_threads)/2., constraint_max[1]}
 			};
 

From 38d78f90264238e46caa238966215019c2d63546 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Thu, 22 Nov 2018 14:36:43 +0000
Subject: [PATCH 30/37] Final exploration will re-use even stale cache entries

---
 src/components/nmsimplex_bbincr.cpp | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 850f439..851156d 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -1382,14 +1382,36 @@ bool NelderMead::testConvergence(std::size_t tested_combinations)
         warming_up_step = 0;
         itr --;
         convergence_reevaluating = true;
+        std::vector<optstepresult> fresh;
+
+        for ( const auto &entry: cache_ ) {
+            fresh.push_back(entry.second);
+        }
+
         cache_.clear();
 
+        std::sort(fresh.begin(), fresh.end(), 
+            [this](const optstepresult &l, const optstepresult &r) mutable -> int {
+                return evaluate_score(l.objectives, nullptr) < 
+                        evaluate_score(r.objectives, nullptr);
+            });
+
+        for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i ) {
+            v[i][0] = fresh[i].threads;
+            v[i][1] = fresh[i].freq_idx;
+        }
+
+        vs = 0;
+        vh = 1;
+        vg = 2;
+
         for (auto i=0; i<NMD_NUM_KNOBS+1; ++i ) {
             for (auto j=0; j<NMD_NUM_KNOBS; ++j) {
                 initial_configurations[i][j] = v[i][j];
             }
         }
-
+        centroid();
+        
         OUT_DEBUG (
             print_initial_simplex();
         )

From 78ec44b5eaf13788c2a880b903fa8020d8780fd8 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Sun, 25 Nov 2018 21:02:50 +0000
Subject: [PATCH 31/37] INO_NMD now toggles active_nodes_ plus Generic
 implementation of NMD

---
 allscale/components/nmd.hpp | 141 ++++++
 src/components/nmd.cpp      | 871 ++++++++++++++++++++++++++++++++++++
 src/optimizer.cpp           |  18 +-
 3 files changed, 1027 insertions(+), 3 deletions(-)
 create mode 100644 allscale/components/nmd.hpp
 create mode 100644 src/components/nmd.cpp

diff --git a/allscale/components/nmd.hpp b/allscale/components/nmd.hpp
new file mode 100644
index 0000000..4f18ea2
--- /dev/null
+++ b/allscale/components/nmd.hpp
@@ -0,0 +1,141 @@
+/*
+Nelder Mead implementation for arbitrary number of knobs and number of objectives.
+
+Developed explicitly for non-continuous search spaces.
+
+Important information
+---------------------
+
+This implementation uses a cache coupled with the exploration heuristic that is explained
+below to refrain from evaluating the same set of knobs multiple times.
+
+If NMD proposes to explore a knob-set that has been recently evaluated (i.e. there's a
+non-stale entry in the cache) the heuristic will instead propose the closest point that is
+enclosed within the N-dimensional (where N = num_knobs) space near the knob set that NMD
+initially proposed. The N-dimensional space takes the form of a square, cube, hypercube for
+N=2, 3, 4. Each edge may be at most @max_distance long (see generate_unique for more info).
+
+author: vasiliadis.vasilis@gmail.com
+*/
+#ifndef ALLSCALE_NMD_HEADER
+#define ALLSCALE_NMD_HEADER
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <set>
+#include <vector>
+namespace allscale {
+namespace components {
+struct logistics {  // One cached evaluation: a knob-set plus the objectives measured for it.
+    std::vector<double> objectives;  // measurements recorded for `knobs`
+    std::vector<std::size_t> knobs;  // knob values the measurements belong to
+    // Time of the last store/refresh and the validity window; both are handled in milliseconds.
+    int64_t cache_ts, cache_dt;
+    // NOTE(review): nmd.cpp neither reads nor writes this flag as far as visible - confirm it is needed.
+    bool converged;
+};
+
+#define ALPHA 1.0   /* reflection coefficient */
+#define BETA 0.5	/* contraction coefficient */
+#define GAMMA 2.0   /* expansion coefficient */
+#define DELTA 0.5   /* shrinking coefficient */
+
+class NmdGeneric {
+public:
+    NmdGeneric();
+    NmdGeneric(std::size_t num_knobs, std::size_t num_objectives,
+               double conv_threshold, int64_t cache_expire_dt_ms,
+               std::size_t max_iters);
+    NmdGeneric(const NmdGeneric& other);
+
+    void initialize(std::size_t constraint_min[], std::size_t constraint_max[],
+                    std::size_t *initial_config[], double weights[]);
+
+    void ensure_profile_consistency(std::size_t expected[], const std::size_t observed[]) const;
+
+    void set_constraints_now(std::size_t constraint_min[], std::size_t constraint_max[]);
+
+    double score(const double measurements[]) const;
+
+    std::pair<std::vector<std::size_t>, bool> get_next(const double measurements[], 
+                            std::size_t observed_knobs[]);
+
+// protected:
+    bool test_convergence();
+    std::vector<std::size_t> do_warmup(const double measurements[], 
+                            std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_reflect(const double measurements[], 
+                            std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_expand(const double measurements[], 
+                            std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_contract_in(const double measurements[], 
+                            std::size_t observed_knobs[]);
+        std::vector<std::size_t> do_contract_out(const double measurements[], 
+                            std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_shrink();
+    std::vector<std::size_t> do_start(bool consult_cache);
+
+    void sort_simplex(bool consult_cache=true);
+    void compute_centroid();
+
+    void generate_unique(std::size_t initial[], bool accept_stale,
+                        const std::set<std::vector<std::size_t> > *extra) const;
+    std::size_t compute_max_combinations() const;
+
+    template<typename T>
+    void apply_constraint(T knobs[]) const
+    {
+        for (auto i=0ul; i<num_knobs; ++i) {
+            if ( knobs[i] < (T) constraint_min[i] )
+                knobs[i] = constraint_min[i];
+            if ( knobs[i] > (T) constraint_max[i] )
+                knobs[i] = constraint_max[i];
+        }
+    }
+
+    //VV: Used to generate all possible combinations of +-
+    // from: https://stackoverflow.com/questions/4633584/
+    template <typename Iter>
+    bool next_binary(Iter begin, Iter end) const
+    {
+        while (begin != end)       // we're not done yet
+        {
+            --end;
+            if ((*end & 1) == 0)   // even number is treated as zero
+            {
+                ++*end;            // increase to one
+                return true;       // still more numbers to come
+            }
+            else                   // odd number is treated as one
+            {
+                --*end;            // decrease to zero and loop
+            }
+        }
+        return false;              // that was the last number 
+    }
+
+    enum estate {warmup, start, reflect, expand, contract_in, contract_out, shrink};
+    estate current_state;
+    std::size_t warmup_step;
+
+    double conv_threshold;
+    std::size_t num_knobs;
+    std::size_t num_objectives;
+
+    double *scores;
+    std::size_t **simplex, **initial_config;
+    std::size_t *constraint_max, *constraint_min;
+    std::size_t *point_reflect, *point_contract, *point_expand, *centroid;
+    std::map< std::vector<std::size_t>, logistics> cache;
+    int64_t cache_expire_dt_ms;
+    double *weights;
+    std::size_t times_reentered_start;
+    double score_reflect, score_contract, score_expand;
+    bool final_explore;
+    std::size_t iteration, max_iters;
+};
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/src/components/nmd.cpp b/src/components/nmd.cpp
new file mode 100644
index 0000000..9bae1cf
--- /dev/null
+++ b/src/components/nmd.cpp
@@ -0,0 +1,871 @@
+#include <iostream>
+#include <chrono>
+#include <cstdlib>
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <allscale/components/nmd.hpp>
+
+
+// #define NMD_DEBUG_
+// #define NMD_INFO_
+
+#ifdef NMD_DEBUG_
+#define OUT_DEBUG(X) X
+#ifndef NMD_INFO_
+    #define NMD_INFO_
+#endif
+#else
+#define OUT_DEBUG(X) {}
+#endif
+
+#if defined(NMD_INFO_)
+#define OUT_INFO(X) X
+#else
+#define OUT_INFO(X) {}
+#endif
+
+
+using namespace allscale::components;
+
+// Builds an empty optimizer (no knobs, no buffers); every member gets a defined value.
+NmdGeneric::NmdGeneric()
+: current_state(warmup), warmup_step(0), conv_threshold(0), num_knobs(0), num_objectives(0),
+scores(nullptr), simplex(nullptr), initial_config(nullptr), constraint_max(nullptr),
+constraint_min(nullptr), point_reflect(nullptr), point_contract(nullptr), point_expand(nullptr),
+centroid(nullptr), cache_expire_dt_ms(0), weights(nullptr), times_reentered_start(0),
+score_reflect(0), score_contract(0), score_expand(0), final_explore(false), iteration(0), max_iters(0)
+{}
+
+// Allocates every buffer for `num_knobs` knobs and `num_objectives` objectives (zero-filled) and
+// puts the state machine in `warmup`. initialize() must still supply constraints and weights.
+NmdGeneric::NmdGeneric(std::size_t num_knobs,
+                        std::size_t num_objectives,
+                        double conv_threshold,
+                        int64_t cache_expire_dt_ms,
+                        std::size_t max_iters)
+: current_state(warmup), warmup_step(0),
+conv_threshold(conv_threshold), num_knobs(num_knobs), num_objectives(num_objectives),
+cache_expire_dt_ms(cache_expire_dt_ms), times_reentered_start(0),
+score_reflect(0), score_contract(0), score_expand(0),
+final_explore(false), iteration(0), max_iters(max_iters)
+{
+    // VV: the simplex holds num_knobs+1 vertices, each one a full knob-set
+    scores = new double [num_knobs+1]();
+    centroid = new std::size_t [num_knobs]();
+    simplex = new std::size_t* [num_knobs+1];
+    initial_config = new std::size_t* [num_knobs+1];
+    for (auto i=0ul; i<num_knobs+1; ++i) {
+        simplex[i] = new std::size_t [num_knobs]();
+        initial_config[i] = new std::size_t [num_knobs]();
+    }
+    constraint_max = new std::size_t [num_knobs]();
+    constraint_min = new std::size_t [num_knobs]();
+
+    point_reflect = new std::size_t [num_knobs]();
+    point_contract = new std::size_t [num_knobs]();
+    point_expand = new std::size_t [num_knobs]();
+    weights = new double [num_objectives]();
+}
+
+NmdGeneric::NmdGeneric(const NmdGeneric& other)
+{
+    // NOTE(review): no member is copied from `other`; pointer and scalar members stay uninitialized - confirm intent.
+}
+
+double NmdGeneric::score(const double measurements[]) const  // folds the objectives into one number
+{
+    double ret = std::pow(measurements[0], weights[0]) *
+                std::pow(measurements[1], weights[1]) *
+                std::pow((1-measurements[2]), weights[2]);  // NOTE(review): always reads 3 entries, num_objectives is ignored
+    // A non-finite product, or one above 1.0, is replaced by 1.0 so that the result below is 0.
+    if ( std::isfinite(ret) == 0  || ret > 1.0 ) {
+        ret = 1.0;
+    }
+    return 1.0 - ret;
+}
+
+void NmdGeneric::initialize(std::size_t constraint_min[], std::size_t constraint_max[],
+                    std::size_t *initial_config[], double weights[])
+{
+    for (auto i=0ul; i<num_objectives; ++i)
+        this->weights[i] = weights[i];
+
+    set_constraints_now(constraint_min, constraint_max);
+
+    iteration = 0;
+
+    if ( initial_config == nullptr ) {
+        std::set<std::vector<std::size_t> > fake;
+
+        for (auto i=0ul; i<num_knobs+1; ++i ) {
+            for ( auto j=0ul; j<num_knobs; ++j ) {
+                auto width = constraint_max[j] - constraint_min[j] + 1;
+                this->initial_config[i][j] = std::rand() % width + constraint_min[j];
+            }
+
+            generate_unique(this->initial_config[i], true, &fake);
+            auto new_key = std::vector<std::size_t>();
+            new_key.assign(this->initial_config[i], this->initial_config[i]+num_knobs);
+            fake.insert(new_key);
+        }
+    } else {
+        for (auto i=0ul; i<num_knobs+1; ++i )
+            for (auto j=0ul; j<num_knobs; ++j )
+                this->initial_config[i][j] = initial_config[i][j];
+    }
+
+    current_state = warmup;
+    warmup_step = 0;
+
+    OUT_INFO(
+        for (auto i=0ul; i<num_knobs+1; ++i ) {
+            std::cout << "[NMD|Info] Initial config " << i << " : ";
+            for (auto j=0ul; j<num_knobs; ++j )
+                std::cout << this->initial_config[i][j] << " ";
+            std::cout << std::endl;
+        }
+    )   
+
+    final_explore = false;
+    times_reentered_start = 0;
+}
+
+// Stores a private copy of the per-knob bounds (the first num_knobs entries of each array).
+void NmdGeneric::set_constraints_now(std::size_t constraint_min[],
+                                    std::size_t constraint_max[])
+{
+    // VV: the parameters shadow the members, hence the explicit this->
+    std::copy(constraint_max, constraint_max+num_knobs, this->constraint_max);
+    std::copy(constraint_min, constraint_min+num_knobs, this->constraint_min);
+}
+
+// Moves `initial` (in place) to the closest knob-set accepted by check_novel(), looking at most
+// max_distance steps away per knob. `extra` (optional) lists knob-sets that must not be proposed.
+// `initial` is left as it is when the search is skipped or when it produces no candidate at all.
+void NmdGeneric::generate_unique(std::size_t initial[], bool accept_stale=false,
+                                const std::set<std::vector<std::size_t> > *extra=nullptr) const
+{
+    const auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    auto explored = (std::size_t) std::count_if(cache.begin(), cache.end(), [ts_now, accept_stale](const auto &entry) {
+        auto dt = ts_now - entry.second.cache_ts;
+        return accept_stale || dt < entry.second.cache_dt;
+    });
+
+    auto max_comb = compute_max_combinations();
+
+    if ( max_comb > explored && max_comb - explored > 1 ) {
+        // VV: TODO Optimize check_novel(). Currently, large "max_distance" values
+        //     may result in extreme overheads
+        const auto max_distance = 3ul;
+        // VV: signed scratch buffer (no VLA), a step below zero is clamped by apply_constraint()
+        std::vector<int64_t> temp(num_knobs);
+        std::set< std::vector<std::size_t> > candidates;
+
+        auto check_novel = [this, &ts_now, &candidates, &accept_stale, &extra](int64_t knobs[]) mutable -> void {
+            apply_constraint(knobs);
+
+            auto key = std::vector<std::size_t>();
+            key.assign(knobs, knobs+num_knobs);
+            auto entry = cache.find(key);
+            if ( extra == nullptr || extra->find(key) == extra->end()) {
+                if ( entry == cache.end() ) {
+                    candidates.insert(key);
+                } else {
+                    auto dt = ts_now - entry->second.cache_ts;
+                    if (accept_stale==false ||
+                        (dt >= entry->second.cache_dt && cache_expire_dt_ms > 0) ) {
+                        candidates.insert(key);
+                    }
+                }
+            }
+        };
+
+        auto counters = std::vector<std::size_t>(num_knobs, 0ul);
+        bool done = false;
+
+        while ( done == false ) {
+            // VV: Generate all possible permutations
+            auto ops = std::string(num_knobs, '0');
+            do{
+                for ( auto j=0ul; j<num_knobs; ++j ) {
+                    temp[j] = ops[j] == '0' ? initial[j] + counters[j] :
+                                        initial[j] - counters[j];
+                }
+                check_novel(temp.data());
+            } while (next_binary(ops.begin(), ops.end()));
+
+            // VV: Increase inner-most loop and see if the whole process is terminated or not
+            counters[0] += 1;
+
+            for ( auto i=0ul; i<num_knobs-1; ++i ) {
+                if ( (counters[i] > constraint_max[i] - constraint_min[i] +1) ||
+                    (counters[i] > max_distance) ) {
+                    counters[i] = 0;
+                    counters[i+1] += 1;
+                }
+            }
+
+            if ( (counters[num_knobs-1] >
+                    constraint_max[num_knobs-1] - constraint_min[num_knobs-1] +1)
+                || (counters[num_knobs-1] > max_distance))
+                done = true;
+        }
+
+        std::vector< std::vector<std::size_t> > sorted;
+        sorted.assign(candidates.begin(), candidates.end());
+        candidates.clear();
+
+        std::sort(sorted.begin(), sorted.end(),
+            [initial](const auto &e1, const auto &e2) mutable -> int {
+                int64_t t;
+                std::size_t d1=0ul, d2=0ul;
+
+                for (auto i=0ul; i<e1.size(); ++i) {
+                    t = (int64_t)e1[i] - (int64_t)initial[i];
+                    d1 += t*t;
+
+                    t = (int64_t)e2[i] - (int64_t)initial[i];
+                    d2 += t*t;
+                }
+
+                return d1 < d2;
+            });
+        // VV: all neighbours may be taken already: keep `initial` instead of reading sorted[0]
+        for (auto i=0ul; i<num_knobs && sorted.empty() == false; ++i)
+            initial[i] = sorted[0][i];
+    }
+}
+
+// Counts the distinct knob-sets inside the constraint box: the product over all knobs of
+// (constraint_max - constraint_min + 1). Returns 0 while the constraint arrays are unset.
+std::size_t NmdGeneric::compute_max_combinations() const
+{
+    if ( constraint_max == nullptr || constraint_min == nullptr )
+        return 0ul;
+    std::size_t combinations = 1;
+    // VV: knob ranges are independent, so their sizes multiply rather than add up
+    for ( auto i=0ul; i<num_knobs; ++i )
+        combinations *= constraint_max[i] - constraint_min[i] + 1;
+    return combinations;
+}
+
+// Compares the knob-set NMD proposed last (`expected`) with the one that was actually profiled
+// (`observed`); on any difference `expected` is overwritten so that it mirrors `observed`.
+void NmdGeneric::ensure_profile_consistency(std::size_t expected[],
+    const std::size_t observed[]) const
+{
+    if ( std::equal(expected, expected+num_knobs, observed) )
+        return;
+
+    // VV: The whole report lives inside OUT_INFO so that builds without NMD_INFO_ print nothing
+    //     (the knob values used to reach stdout without their label)
+    OUT_INFO(
+        std::cout << "[NMD|Info] Profile does not match last suggestion, will correct: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << expected[i] << " ";
+
+        std::cout << " -- ";
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << observed[i] << " ";
+        std::cout << std::endl;
+    )
+
+    // VV: Trust what was measured: the callers key their cache entry on `observed`
+    for (auto i=0ul; i<num_knobs; ++i)
+        expected[i] = observed[i];
+}
+
+// Recomputes `centroid` as the per-knob mean (rounded, then clamped to the constraints) of
+// simplex[0..num_knobs-1]; the vertex stored last (the worst one once sort_simplex() ran) is left out.
+void NmdGeneric::compute_centroid()
+{
+    std::vector<double> c(num_knobs, 0.0);
+    for (auto knob=0ul; knob<num_knobs; ++knob )
+    {
+        // VV: rows of simplex are vertices, columns are knobs -> walk down the column
+        for (auto vertex=0ul; vertex<num_knobs; ++vertex)
+            c[knob] += simplex[vertex][knob];
+
+        c[knob] = round(c[knob]/(double) num_knobs);
+    }
+    apply_constraint(c.data());
+
+    for (auto knob=0ul; knob<num_knobs; ++knob)
+        centroid[knob] = (std::size_t) c[knob];
+}
+
+// Re-stamps the cache entries of the current vertices, then orders the simplex from lowest to
+// highest score. With `consult_cache` the num_knobs+1 lowest-scoring fresh cache entries replace
+// the simplex when enough of them exist; otherwise the current vertices are merely reordered.
+void NmdGeneric::sort_simplex(bool consult_cache)
+{
+    auto key = std::vector<std::size_t>();
+    const auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    for ( auto i = 0ul; i<num_knobs+1; ++i) {
+        key.assign(simplex[i], simplex[i]+num_knobs);
+        auto entry = cache.find(key);
+        if ( entry == cache.end() )
+            continue;   // VV: never profiled, so there is no entry to keep alive
+        entry->second.cache_ts = ts_now;
+        entry->second.cache_dt = cache_expire_dt_ms;
+    }
+
+    OUT_DEBUG(
+        std::cout << "CACHE ENTRIES: "<<cache.size() << std::endl;
+        for (const auto &e:cache) {
+            for (auto i=0ul; i<num_knobs; ++i)
+                std::cout << e.second.knobs[i] << " ";
+            std::cout << ": ";
+            for (auto i=0ul; i<num_objectives; ++i)
+                std::cout << e.second.objectives[i] << " ";
+            std::cout << " = " << score(e.second.objectives.data());
+            std::cout << std::endl;
+        }
+    )
+    std::vector<logistics> fresh;
+    if ( consult_cache ) {
+        for (const auto &e:cache )
+            if (ts_now - e.second.cache_ts < e.second.cache_dt)
+                fresh.push_back(e.second);
+        std::sort(fresh.begin(), fresh.end(),
+            [this](const auto &e1, const auto &e2) mutable ->int {
+                return this->score(e1.objectives.data()) < this->score(e2.objectives.data());
+            });
+    }
+
+    if ( fresh.size() >= num_knobs+1) {
+        for (auto i=0ul; i<num_knobs+1; ++i) {
+            memcpy(simplex[i], fresh[i].knobs.data(), sizeof(std::size_t)*num_knobs);
+            scores[i] = score(fresh[i].objectives.data());
+        }
+    } else {
+        std::vector< std::pair<double, std::vector<std::size_t> > > plain;
+        for ( auto i=0ul; i<num_knobs+1; ++i ) {
+            key.assign(simplex[i], simplex[i] + num_knobs);
+            plain.push_back( std::make_pair(scores[i], key));
+        }
+
+        std::sort(plain.begin(), plain.end(),
+        [](const auto &e1, const auto &e2) mutable ->int {
+            return e1.first < e2.first;
+        });
+
+        for (auto i=0ul; i<num_knobs+1; ++i) {
+            memcpy(simplex[i], plain[i].second.data(), sizeof(std::size_t)*num_knobs);
+            scores[i] = plain[i].first;
+        }
+    }
+}
+
+std::vector<std::size_t> NmdGeneric::do_start(bool consult_cache=true)
+{
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER start" << std::endl;
+    )
+    iteration ++;
+    sort_simplex(consult_cache);
+    compute_centroid();
+    double temp[num_knobs];
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] Initial simplex" << std::endl;
+        for ( auto i=0ul; i<num_knobs+1; ++i) {
+            std::cout << "[NMD|Info] Score " << scores[i];
+            for ( auto j=0ul; j<num_knobs; ++j)
+                std::cout << " " << simplex[i][j];
+            std::cout << std::endl;
+        }
+
+        std::cout << "[NMD|Info] Centroid: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << centroid[i] << " ";
+        std::cout << std::endl;
+    )
+    for (auto i=0ul; i<num_knobs; ++i)
+        temp[i] = centroid[i] + ALPHA * (centroid[i] - (double)simplex[num_knobs][i]);
+    
+    apply_constraint(temp);
+
+    for ( auto i=0ul; i<num_knobs; ++i)
+        point_reflect[i] = temp[i];
+    
+    generate_unique(point_reflect, false);
+
+    auto key = std::vector<std::size_t>();
+    key.assign(point_reflect, point_reflect + num_knobs);
+
+    auto entry = cache.find(key);
+
+    current_state = reflect;
+
+    if ( entry != cache.end() 
+        && times_reentered_start++ < 5
+        && iteration < max_iters ) {
+        auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+        if ( ts_now - entry->second.cache_ts < entry->second.cache_dt ) {
+            return do_reflect(entry->second.objectives.data(), entry->second.knobs.data());
+        }
+    }
+    
+    return key;
+}
+
+std::vector<std::size_t> NmdGeneric::do_shrink()
+{
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER shrink" << std::endl;
+    )
+    // `fake` collects the vertices generated so far; it is handed to generate_unique() as `extra`.
+    std::set<std::vector<std::size_t> > fake;
+    std::vector<std::size_t> key;
+    // NOTE(review): vertices are pulled towards `centroid` (by DELTA), not towards simplex[0] - confirm intent.
+    for ( auto i=0ul; i<num_knobs+1; ++i ) {
+        double temp[num_knobs];
+
+        for ( auto j=0ul; j<num_knobs; ++j ) {
+            temp[j] = centroid[j] + DELTA * ((double)simplex[i][j] - (double)centroid[j]);
+        }
+
+        apply_constraint(temp);
+
+        for ( auto j=0ul; j<num_knobs; ++j)
+            initial_config[i][j] = temp[j];
+        // The shrunk vertex becomes the i-th warm-up configuration.
+        generate_unique(initial_config[i], false, &fake);
+
+        key.assign(initial_config[i], initial_config[i]+num_knobs);
+        fake.insert(key);
+    }
+    // Restart the warm-up so that every new vertex gets profiled again.
+    current_state = warmup;
+    warmup_step = 0;
+
+    OUT_INFO(
+        for (auto i=0ul; i<num_knobs+1; ++i ) {
+            std::cout << "[NMD|Info] Shrank simplex " << i << " : ";
+            for (auto j=0ul; j<num_knobs; ++j )
+                std::cout << this->initial_config[i][j] << " ";                
+            std::cout << std::endl;
+        }
+    )
+    // warmup_step is 0 here, so do_warmup() does not read its two (empty) arguments.
+    return do_warmup({}, {});
+}
+
+std::vector<std::size_t> NmdGeneric::do_contract_out(const double measurements[], 
+                            std::size_t observed_knobs[])
+{
+    ensure_profile_consistency(point_contract, observed_knobs);
+    score_contract = score(measurements);
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER ContractOUT: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_contract[i] << " ";
+        std::cout << ":" << score_contract << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_contract <= score_reflect ){
+        // VV: foc <= fr then replace v[n] with voc
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_contract[i];
+        
+        scores[num_knobs] = score_contract;
+        current_state = start;
+        return do_start(true);
+    } else {
+        current_state = shrink;
+        return do_shrink();
+    }
+}
+
+std::vector<std::size_t> NmdGeneric::do_contract_in(const double measurements[], 
+                            std::size_t observed_knobs[])
+{
+    ensure_profile_consistency(point_contract, observed_knobs);
+    score_contract = score(measurements);
+    
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER ContractIN: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_contract[i] << " ";
+        std::cout << ":" << score_contract << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_contract < scores[num_knobs] ){
+        // VV: fic < f[n] then replace v[n] with vic
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_contract[i];
+        scores[num_knobs] = score_contract;
+        current_state = start;
+        return do_start(true);
+    } else {
+        current_state = shrink;
+        return do_shrink();
+    }
+}
+
+
+std::vector<std::size_t> NmdGeneric::do_expand(const double measurements[], 
+                            std::size_t observed_knobs[])
+{
+    ensure_profile_consistency(point_expand, observed_knobs);
+    score_expand = score(measurements);
+
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER Expand: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_expand[i] << " ";
+        std::cout << ":" << score_expand << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_expand < score_reflect ){
+        // VV: fe < fr then replace v[n] with ve
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_expand[i];
+        scores[num_knobs] = score_expand;
+    } else {
+        // VV: fr <= fe then replace v[n] with vr
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_reflect[i];
+        scores[num_knobs] = score_reflect;
+    }
+
+    current_state = start;
+    return do_start(false);
+}
+
+// Consumes the measurement of the reflected point, caches it and picks the next Nelder-Mead move:
+// accept the reflection, try an expansion (GAMMA) or try an outside/inside contraction (BETA).
+// Returns the knob-set to profile next; a fresh cache hit for that set is consumed right away.
+std::vector<std::size_t> NmdGeneric::do_reflect(const double measurements[],
+                            std::size_t observed_knobs[])
+{
+    ensure_profile_consistency(point_reflect, observed_knobs);
+    score_reflect = score(measurements);
+
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER Reflect: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_reflect[i] << " ";
+        std::cout << ":" << score_reflect << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_reflect >= scores[0] && score_reflect < scores[num_knobs-1]) {
+        // VV: fo <= fr < f[n-1] then replace v[n] with vr and start over
+        for ( auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_reflect[i];
+        scores[num_knobs] = score_reflect;
+        current_state = start;
+        return do_start(true);
+    } else if (score_reflect < scores[0]) {   // VV: fr < f[0] then try to expand past vr
+        double temp[num_knobs];
+        current_state = expand;
+        for (auto i=0ul; i<num_knobs; ++i)
+            temp[i] = centroid[i] + GAMMA * (point_reflect[i] - (double)centroid[i]);
+        apply_constraint(temp);
+
+        for ( auto i=0ul; i<num_knobs; ++i)
+            point_expand[i] = temp[i];
+        generate_unique(point_expand, false);
+
+        auto key = std::vector<std::size_t>();
+        key.assign(point_expand, point_expand+num_knobs);
+        auto e = cache.find(key);
+
+        if ( e != cache.end() ) {
+            if ( ts_now - e->second.cache_ts < e->second.cache_dt ) {
+                return do_expand(e->second.objectives.data(),
+                                e->second.knobs.data());
+            }
+        }
+
+        return key;
+    } else if (scores[num_knobs-1] <= score_reflect
+            &&  score_reflect < scores[num_knobs]) {
+        // VV: Reflect lies between f[n-1] and f[n] then contract (outside)
+        current_state = contract_out;
+        double temp[num_knobs];
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            temp[i] = centroid[i] + BETA * (point_reflect[i] - (double)centroid[i]);
+        apply_constraint(temp);
+
+        for ( auto i=0ul; i<num_knobs; ++i)
+            point_contract[i] = temp[i];
+        generate_unique(point_contract, false);
+
+        auto key = std::vector<std::size_t>();
+        key.assign(point_contract, point_contract+num_knobs);
+        auto e = cache.find(key);
+
+        if ( e != cache.end() ) {
+            if ( ts_now - e->second.cache_ts < e->second.cache_dt ) {
+                return do_contract_out(e->second.objectives.data(),
+                                e->second.knobs.data());
+            }
+        }
+
+        return key;
+    } else if (score_reflect >= scores[num_knobs]) {
+        // VV: Reflect > f[n] then contract (inside)
+        current_state = contract_in;
+        double temp[num_knobs];
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            temp[i] = (double) centroid[i] - BETA * ((double)point_reflect[i] - (double)centroid[i]);
+        apply_constraint(temp);
+
+        for ( auto i=0ul; i<num_knobs; ++i)
+            point_contract[i] = temp[i];
+        generate_unique(point_contract, false);
+
+        auto key = std::vector<std::size_t>();
+        key.assign(point_contract, point_contract+num_knobs);
+        auto e = cache.find(key);
+
+        if ( e != cache.end() ) {
+            if ( ts_now - e->second.cache_ts < e->second.cache_dt ) {
+                return do_contract_in(e->second.objectives.data(),
+                                e->second.knobs.data());
+            }
+        }
+
+        return key;
+    }
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] Should never get here" << std::endl;
+    )
+
+    current_state = start;
+    return do_start(true);
+}
+
+std::vector<std::size_t> NmdGeneric::do_warmup(const double measurements[], 
+                            std::size_t observed_knobs[])
+{
+    std::vector<std::size_t> ret;
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER warmup" << std::endl;
+    )
+
+    if ( warmup_step > 0 ) {
+        auto last = warmup_step - 1;
+        ensure_profile_consistency(initial_config[last], observed_knobs);
+        memcpy(simplex[last], initial_config[last], sizeof(std::size_t)*num_knobs);
+        scores[last] = score(measurements);
+        auto key = std::vector<size_t>();
+        key.assign(observed_knobs, observed_knobs+num_knobs);
+        
+        logistics entry;
+
+        entry.cache_dt = cache_expire_dt_ms;
+        entry.cache_ts = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+        entry.objectives.assign(measurements, measurements+num_objectives);
+
+        cache[key] = entry;
+        
+        OUT_DEBUG(
+            auto s = score(measurements);
+            std::cout << "[NMD|Dbg]  Score: " << s << " for ";
+            for( auto i=0ul; i<num_knobs; ++i)
+                std::cout << observed_knobs[i] << " ";
+            std::cout << std::endl;
+        )
+    }
+
+    if ( warmup_step == num_knobs +1 ) {
+        OUT_DEBUG(
+            std::cout << "[NMD|Dbg]  Warmup results" << std::endl;
+
+            for (const auto &e:cache) {
+                for (auto i=0ul; i<num_knobs; ++i)
+                    std::cout << e.second.knobs[i] << " ";
+                std::cout << ": ";
+                for (auto i=0ul; i<num_objectives; ++i)
+                    std::cout << e.second.objectives[i] << " ";
+                std::cout << " = " << score(e.second.objectives.data());
+                std::cout << std::endl;
+            }
+        )
+        
+        current_state = start;
+        return do_start(false);
+    }
+    
+    ret.assign(this->initial_config[warmup_step],
+                this->initial_config[warmup_step]+num_knobs);
+    
+    warmup_step ++;
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] Warmup Explore: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << ret[i] << " ";
+        
+        std::cout << std::endl;
+    )
+
+    return ret;
+}
+
+std::pair<std::vector<std::size_t>, bool> NmdGeneric::get_next(const double measurements[], 
+                            std::size_t observed_knobs[])
+{
+    std::vector<std::size_t> ret;
+    #if defined(NMD_DEBUG_) || defined(NMD_INFO_)
+        const char *state_names[] = {
+            "warmup",
+            "start",
+            "reflect",
+            "expand",
+            "contract_in",
+            "contract_out",
+            "shrink"
+        };
+    #endif
+
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  Current stage " << state_names[current_state] << std::endl;
+    )
+    
+    switch (current_state) {
+        case warmup:
+            ret = do_warmup(measurements, observed_knobs);
+            break;
+        case start:
+            times_reentered_start = 0;
+            ret = do_start(true);
+            break;
+        case reflect:
+            ret = do_reflect(measurements, observed_knobs);
+            break;
+        case expand:
+            ret = do_expand(measurements, observed_knobs);
+            break;
+        case contract_in:
+            ret = do_contract_in(measurements, observed_knobs);
+            break;
+        case contract_out:
+            ret = do_contract_out(measurements, observed_knobs);
+            break;
+        case shrink:
+            ret = do_shrink();
+            break;
+        default:
+            std::cout << "Unknown state!" << std::endl;
+    }
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] State " << state_names[current_state] << " proposes ";
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << ret[i] << " ";
+        std::cout << std::endl;
+    )
+
+    bool converged = false;
+    if ( current_state != warmup )
+    {
+        converged = test_convergence();
+    }
+
+    if ( converged )
+        ret.assign(simplex[0], simplex[0] + num_knobs);
+
+    return std::make_pair(ret, converged);
+}
+
+
+bool NmdGeneric::test_convergence()  // true once the simplex scores agree or the iterations ran out
+{
+    double avg, sum;
+
+    avg = 0.0;
+    sum = 0.0;
+    // Mean of the num_knobs+1 vertex scores.
+    for ( auto i=0ul; i<num_knobs+1; ++i)
+        avg += scores[i];
+    
+    avg /= (num_knobs+1);
+    // Sum of the squared deviations from that mean ...
+    for ( auto i=0ul; i<num_knobs+1; ++i) {
+        double t = scores[i] - avg;
+        sum += t * t;        
+    }
+    // ... turned into a standard deviation (divisor num_knobs, i.e. the sample count minus one).
+    sum /= num_knobs;
+    sum = sqrt(sum);
+    // Stop when the iteration budget is used up or the spread is within conv_threshold.
+    if (iteration >= max_iters || sum <= conv_threshold ) {
+        // if ( final_explore == false ) {
+        //     final_explore = true;
+
+        //     return false;
+        // } else {
+        //     return true;
+        // }
+        OUT_INFO(
+            std::cout << "[NMD|Info] Converged at " << sum 
+                      << " threshold: " << conv_threshold << std::endl;
+
+            std::cout << "[NMD|Info] Converged simplex" << std::endl;
+            for ( auto i=0ul; i<num_knobs+1; ++i) {
+                std::cout << "[NMD|Info] Score " << scores[i];
+                for ( auto j=0ul; j<num_knobs; ++j)
+                    std::cout << " " << simplex[i][j];
+                std::cout << std::endl;
+            }
+        )
+        return true;
+    }
+
+    return false;
+}
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index a67428f..4b29f05 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -613,13 +613,19 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
                     for (const auto &physical:active_nodes_) {
                         if ( physical ) {
                             OUT_DEBUG(
-                                std::cout << "[Ino_NMD] Node " << cur_node << " is alive!" << std::endl;
+                                std::cout << "[Ino_NMD] Node " << cur_node << " is was used last time" << std::endl;
+                            )
+                        } else {
+                            OUT_DEBUG(
+                                std::cout << "[Ino_NMD] Node " << cur_node << " was not used last time" << std::endl;
                             )
-                            virtual_to_physical.push_back(cur_node);
                         }
+
+                        virtual_to_physical.push_back(cur_node);
                         cur_node ++;
                     }
-
+                    num_active_nodes = active_nodes_.size();
+                    
                     if ( new_num_nodes > num_active_nodes )
                         new_num_nodes = num_active_nodes;
                     
@@ -778,6 +784,12 @@ hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_
 
                         previous_num_nodes = new_num_nodes;
                         hpx::lcos::broadcast_apply<allscale_optimizer_update_policy_action_ino>(localities_, new_mapping);
+
+                        for (auto i=0u; i<new_num_nodes; ++i ) {
+                            active_nodes_[i] = true;
+                        }
+                        for ( auto i=new_num_nodes ;i<active_nodes_.size(); ++i)
+                            active_nodes_[i] = false;
                     }
 
                     if ( threads_min != threads_max )

From e54f48e88dd0d4af82bcce92eeb991c55a8d7a0d Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 26 Nov 2018 16:36:26 +0000
Subject: [PATCH 32/37] Minor improvements to NmdGeneric and alternative tuner
 based on NMD

- NmdGeneric::initialize() now takes the score function as a parameter
- Reverted default score function to speed/efficiency/power (dashboard)
- Fixed a couple of small bugs in nmsimplex_bbincr (for use with legacy
  global and local NMD optimizers which rely on time/threads/power)
---
 allscale/components/nmd.hpp              |  45 +++-
 allscale/components/nmsimplex_bbincr.hpp |  10 +-
 allscale/tuner.hpp                       |  19 ++
 src/CMakeLists.txt                       |   1 +
 src/components/localoptimizer.cpp        |   8 +-
 src/components/nmd.cpp                   |  60 ++---
 src/components/nmsimplex_bbincr.cpp      | 279 ++++++++++++-----------
 src/dashboard.cpp                        |   2 +-
 src/optimizer.cpp                        |  22 +-
 src/tuner.cpp                            | 103 +++++++++
 10 files changed, 359 insertions(+), 190 deletions(-)

diff --git a/allscale/components/nmd.hpp b/allscale/components/nmd.hpp
index 4f18ea2..7f462ff 100644
--- a/allscale/components/nmd.hpp
+++ b/allscale/components/nmd.hpp
@@ -18,10 +18,12 @@ N=2, 3, 4. Each edge may be at most @max_distance_long (see generate_unique) for
 author: vasiliadis.vasilis@gmail.com
 */
 #ifndef ALLSCALE_NMD_HEADER
+#define ALLSCALE_NMD_HEADER
 #include <cstddef>
 #include <map>
 #include <set>
 #include <vector>
+#include <cmath>
 
 namespace allscale {
 namespace components {
@@ -46,32 +48,51 @@ class NmdGeneric {
     NmdGeneric(std::size_t num_knobs, std::size_t num_objectives,
                double conv_threshold, int64_t cache_expire_dt_ms,
                std::size_t max_iters);
-    NmdGeneric(const NmdGeneric& other);
 
-    void initialize(std::size_t constraint_min[], std::size_t constraint_max[],
-                    std::size_t *initial_config[], double weights[]);
+    static double score_speed_efficiency_power(const double measurements[], const double weights[])
+    {
+        double ret = std::pow(measurements[0], weights[0]) *
+                    std::pow(measurements[1], weights[1]) *
+                    std::pow((1-measurements[2]), weights[2]);
+        
+        if ( std::isfinite(ret) == 0  || ret > 1.0 ) {
+            ret = 1.0;
+        }
+        
+        return 1.0 - ret;
+    }
+
+    void initialize(const std::size_t constraint_min[], const std::size_t constraint_max[],
+                    const std::size_t *initial_config[], const double weights[],
+                    double (*score_function)(const double[], const double []));
 
     void ensure_profile_consistency(std::size_t expected[], const std::size_t observed[]) const;
 
-    void set_constraints_now(std::size_t constraint_min[], std::size_t constraint_max[]);
+    void set_constraints_now(const std::size_t constraint_min[], 
+                             const std::size_t constraint_max[]);
 
     double score(const double measurements[]) const;
 
     std::pair<std::vector<std::size_t>, bool> get_next(const double measurements[], 
-                            std::size_t observed_knobs[]);
+                            const std::size_t observed_knobs[]);
 
-// protected:
+protected:
     bool test_convergence();
+
+    // VV: (measurements, weights) returns value in range [0.0, infinite)
+    //     0.0 means perfect score (i.e. the larger the score, the worse it is)
+    double (*score_function)(const double[], const double []);
+
     std::vector<std::size_t> do_warmup(const double measurements[], 
-                            std::size_t observed_knobs[]);
+                            const std::size_t observed_knobs[]);
     std::vector<std::size_t> do_reflect(const double measurements[], 
-                            std::size_t observed_knobs[]);
+                            const std::size_t observed_knobs[]);
     std::vector<std::size_t> do_expand(const double measurements[], 
-                            std::size_t observed_knobs[]);
+                            const std::size_t observed_knobs[]);
     std::vector<std::size_t> do_contract_in(const double measurements[], 
-                            std::size_t observed_knobs[]);
-        std::vector<std::size_t> do_contract_out(const double measurements[], 
-                            std::size_t observed_knobs[]);
+                            const std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_contract_out(const double measurements[], 
+                            const std::size_t observed_knobs[]);
     std::vector<std::size_t> do_shrink();
     std::vector<std::size_t> do_start(bool consult_cache);
 
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index 11e5d09..891938d 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -81,7 +81,8 @@ enum iterationstates
 	start,
 	reflection,
 	expansion,
-	contraction,
+	contraction_in,
+	contraction_out,
 	shrink
 };
 
@@ -165,9 +166,12 @@ class NelderMead
 			double knob1, double knob2);
 	optstepresult do_step_expand(const double objectives[], 
 			double knob1, double knob2);
-	optstepresult do_step_contract(const double objectives[], 
+	optstepresult do_step_contract_in(const double objectives[], 
 			double knob1, double knob2);
-	optstepresult do_step_shrink(const double objectives[], 
+	optstepresult do_step_contract_out(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_shrink();
+	optstepresult do_step_warmup(const double objectives[], 
 			double knob1, double knob2);
 
 	void sort_vertices(void);
diff --git a/allscale/tuner.hpp b/allscale/tuner.hpp
index da28253..f1285a8 100644
--- a/allscale/tuner.hpp
+++ b/allscale/tuner.hpp
@@ -3,6 +3,7 @@
 #define ALLSCALE_TUNER_HPP
 
 #include <allscale/tuning_objective.hpp>
+#include <allscale/components/nmd.hpp>
 
 #include <iostream>
 #include <vector>
@@ -74,6 +75,24 @@ namespace allscale {
 
         void next_direction();
     };
+
+    struct nmd_optimizer : tuner
+    {
+        nmd_optimizer(std::size_t nodes_min, std::size_t nodes_max);
+        components::NmdGeneric nmd;
+        std::vector<std::size_t> avail_freqs;
+        std::vector<std::size_t> best;
+        bool converged;
+        bool initialized;
+        // VV: even though NmdGeneric supports arbitrary number of optimization parameters
+        //     we're applying it to number of nodes and CPU frequency, it is trivial to 
+        //     add number of threads
+        std::size_t constraint_min[2], constraint_max[2];
+
+        tuner_configuration next(tuner_configuration const& current_cfg, tuner_state const& current_state, tuning_objective) override;
+
+        double previous_weights[3];
+    };
 }
 
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 25cf7c9..1481fbf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -25,6 +25,7 @@ set(_srcs
     components/util/graph_colouring.cpp
     components/localoptimizer.cpp
     components/nmsimplex_bbincr.cpp
+    components/nmd.cpp
 )
 
 if(CPUFREQ_FOUND)
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index d002112..52989d8 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -189,7 +189,7 @@ void localoptimizer::setmaxthreads(std::size_t threads)
 			if ( min_threads < 1 )
 				min_threads = 1;
 			
-			double constraint_min[] = {min_threads, min_freq};
+			double constraint_min[] = {(double) min_threads, (double) min_freq};
 			#if defined(ALLSCALE_HAVE_CPUFREQ)
 			double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
 									(double)max_freq};
@@ -226,7 +226,7 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 		min_threads = 1;
 	int max_threads = max_threads_;
 
-	double constraint_min[] = {min_threads, min_freq};
+	double constraint_min[] = { (double) min_threads, (double) min_freq};
 	#if defined(ALLSCALE_HAVE_CPUFREQ)
 	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
 							(double)max_freq};
@@ -251,7 +251,7 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 	} else {
 		if ( time_weight >= energy_weight + resource_weight ) {
 			double initial_simplex[3][2] = {
-				{min_threads, constraint_min[1]},
+				{(double) min_threads, constraint_min[1]},
 				{max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0},
 				{(min_threads+max_threads)/2., constraint_max[1]}
 			};
@@ -261,7 +261,7 @@ void localoptimizer::initialize_nmd(bool from_scratch)
 									constraint_max);
 		} else {
 			double initial_simplex[3][2] = {
-				{min_threads, constraint_min[1]},
+				{(double) min_threads, constraint_min[1]},
 				{max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0},
 				{(min_threads+max_threads)/2., constraint_max[1]}
 			};
diff --git a/src/components/nmd.cpp b/src/components/nmd.cpp
index 9bae1cf..5893b5b 100644
--- a/src/components/nmd.cpp
+++ b/src/components/nmd.cpp
@@ -7,8 +7,8 @@
 #include <allscale/components/nmd.hpp>
 
 
-// #define NMD_DEBUG_
-// #define NMD_INFO_
+#define NMD_DEBUG_
+#define NMD_INFO_
 
 #ifdef NMD_DEBUG_
 #define OUT_DEBUG(X) X
@@ -68,35 +68,30 @@ max_iters(max_iters)
     weights = new double [num_objectives];
 }
 
-NmdGeneric::NmdGeneric(const NmdGeneric& other)
-{
-
-}
-
 double NmdGeneric::score(const double measurements[]) const
 {
-    double ret = std::pow(measurements[0], weights[0]) *
-                std::pow(measurements[1], weights[1]) *
-                std::pow((1-measurements[2]), weights[2]);
-    
-    if ( std::isfinite(ret) == 0  || ret > 1.0 ) {
-        ret = 1.0;
-    }
-    return 1.0 - ret;
+    return (*score_function)(measurements, weights);
 }
 
-void NmdGeneric::initialize(std::size_t constraint_min[], std::size_t constraint_max[],
-                    std::size_t *initial_config[], double weights[])
+void NmdGeneric::initialize(const std::size_t constraint_min[], 
+                            const std::size_t constraint_max[],
+                            const std::size_t *initial_config[], 
+                            const double weights[], double (*score_function)(const double[], const double []))
 {
     for (auto i=0ul; i<num_objectives; ++i)
         this->weights[i] = weights[i];
+    
+    this->score_function = score_function;
 
     set_constraints_now(constraint_min, constraint_max);
 
     iteration = 0;
-
     if ( initial_config == nullptr ) {
         std::set<std::vector<std::size_t> > fake;
+        
+        OUT_INFO(
+            std::cout << "[NMD|Info] Generating initial config for " << num_knobs << std::endl;
+        )
 
         for (auto i=0ul; i<num_knobs+1; ++i ) {
             for ( auto j=0ul; j<num_knobs; ++j ) {
@@ -104,7 +99,7 @@ void NmdGeneric::initialize(std::size_t constraint_min[], std::size_t constraint
                 this->initial_config[i][j] = std::rand() % width + constraint_min[j];
             }
 
-            generate_unique(this->initial_config[i], true, &fake);
+            generate_unique(this->initial_config[i], false, &fake);
             auto new_key = std::vector<std::size_t>();
             new_key.assign(this->initial_config[i], this->initial_config[i]+num_knobs);
             fake.insert(new_key);
@@ -131,8 +126,8 @@ void NmdGeneric::initialize(std::size_t constraint_min[], std::size_t constraint
     times_reentered_start = 0;
 }
 
-void NmdGeneric::set_constraints_now(std::size_t constraint_min[],
-                                    std::size_t constraint_max[])
+void NmdGeneric::set_constraints_now(const std::size_t constraint_min[],
+                                    const std::size_t constraint_max[])
 {
     for (auto i=0ul; i<num_knobs; ++i ){
         this->constraint_max[i] = constraint_max[i];
@@ -170,6 +165,11 @@ void NmdGeneric::generate_unique(std::size_t initial[], bool accept_stale=false,
                 if ( entry == cache.end() ) {
                     candidates.insert(key);
                 } else {
+                    std::cout << "Found ";
+                    for (auto i=0ul; i<num_knobs; ++i ) {
+                        std::cout << key[i] << " ";
+                    }
+                    std::cout << std::endl;
                     auto dt = ts_now - entry->second.cache_ts;
                     if (accept_stale==false || 
                         (dt >= entry->second.cache_dt && cache_expire_dt_ms > 0) ) {
@@ -366,7 +366,7 @@ std::vector<std::size_t> NmdGeneric::do_start(bool consult_cache=true)
         std::cout << "[NMD|Dbg]  INNER start" << std::endl;
     )
     iteration ++;
-    sort_simplex(consult_cache);
+    sort_simplex(false);
     compute_centroid();
     double temp[num_knobs];
 
@@ -457,7 +457,7 @@ std::vector<std::size_t> NmdGeneric::do_shrink()
 }
 
 std::vector<std::size_t> NmdGeneric::do_contract_out(const double measurements[], 
-                            std::size_t observed_knobs[])
+                            const std::size_t observed_knobs[])
 {
     ensure_profile_consistency(point_contract, observed_knobs);
     score_contract = score(measurements);
@@ -494,7 +494,7 @@ std::vector<std::size_t> NmdGeneric::do_contract_out(const double measurements[]
 }
 
 std::vector<std::size_t> NmdGeneric::do_contract_in(const double measurements[], 
-                            std::size_t observed_knobs[])
+                            const std::size_t observed_knobs[])
 {
     ensure_profile_consistency(point_contract, observed_knobs);
     score_contract = score(measurements);
@@ -532,7 +532,7 @@ std::vector<std::size_t> NmdGeneric::do_contract_in(const double measurements[],
 
 
 std::vector<std::size_t> NmdGeneric::do_expand(const double measurements[], 
-                            std::size_t observed_knobs[])
+                            const std::size_t observed_knobs[])
 {
     ensure_profile_consistency(point_expand, observed_knobs);
     score_expand = score(measurements);
@@ -571,7 +571,7 @@ std::vector<std::size_t> NmdGeneric::do_expand(const double measurements[],
 }
 
 std::vector<std::size_t> NmdGeneric::do_reflect(const double measurements[], 
-                            std::size_t observed_knobs[])
+                            const std::size_t observed_knobs[])
 {
     ensure_profile_consistency(point_reflect, observed_knobs);
     score_reflect = score(measurements);
@@ -688,7 +688,7 @@ std::vector<std::size_t> NmdGeneric::do_reflect(const double measurements[],
 }
 
 std::vector<std::size_t> NmdGeneric::do_warmup(const double measurements[], 
-                            std::size_t observed_knobs[])
+                            const std::size_t observed_knobs[])
 {
     std::vector<std::size_t> ret;
     OUT_DEBUG(
@@ -757,7 +757,7 @@ std::vector<std::size_t> NmdGeneric::do_warmup(const double measurements[],
 }
 
 std::pair<std::vector<std::size_t>, bool> NmdGeneric::get_next(const double measurements[], 
-                            std::size_t observed_knobs[])
+                            const std::size_t observed_knobs[])
 {
     std::vector<std::size_t> ret;
     #if defined(NMD_DEBUG_) || defined(NMD_INFO_)
@@ -817,8 +817,10 @@ std::pair<std::vector<std::size_t>, bool> NmdGeneric::get_next(const double meas
         converged = test_convergence();
     }
 
-    if ( converged )
+    if ( converged ) {
+        sort_simplex(true);
         ret.assign(simplex[0], simplex[0] + num_knobs);
+    }
 
     return std::make_pair(ret, converged);
 }
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 851156d..430afac 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -857,7 +857,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[],
         res.threads = vc[0];
         res.freq_idx = vc[1];
 
-        state_ = contraction;
+        state_ = contraction_out;
 
         auto key = std::make_pair(res.threads, res.freq_idx);
 
@@ -870,7 +870,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[],
 
             if (dt < entry->second._cache_expires_dt)
             {
-                return do_step_contract(entry->second.objectives,
+                return do_step_contract_out(entry->second.objectives,
                     entry->second.threads,
                     entry->second.freq_idx);
             }
@@ -896,7 +896,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[],
         res.threads = vc[0];
         res.freq_idx = vc[1];
 
-        state_ = contraction;
+        state_ = contraction_in;
         auto key = std::make_pair(res.threads, res.freq_idx);
 
         auto entry = cache_.find(key);
@@ -908,7 +908,7 @@ optstepresult NelderMead::do_step_reflect(const double objectives[],
 
             if (dt < entry->second._cache_expires_dt)
             {
-                return do_step_contract(entry->second.objectives,
+                return do_step_contract_in(entry->second.objectives,
                     entry->second.threads,
                     entry->second.freq_idx);
             }
@@ -976,12 +976,12 @@ optstepresult NelderMead::do_step_expand(const double objectives[],
     return do_step_start();
 }
 
-optstepresult NelderMead::do_step_contract(const double objectives[],
+optstepresult NelderMead::do_step_contract_in(const double objectives[],
     double knob1, double knob2)
 {
     int j;
 #ifdef NMD_DEBUG_
-    std::cout << "[NelderMead|DEBUG] State = Contraction" << std::endl;
+    std::cout << "[NelderMead|DEBUG] State = ContractionIN" << std::endl;
 #endif
     fc = evaluate_score(objectives, nullptr);
 
@@ -1007,11 +1007,11 @@ optstepresult NelderMead::do_step_contract(const double objectives[],
         cache_update((int)vc[0], (int)vc[1], objectives, true);
     }
 
-    if (fc <= fr)
+    if (fc <= f[NMD_NUM_KNOBS])
     {
-        // VV: CONTRACTED_O is better than REFLECTED
-        //     Replace WORST with CONTRACTED_O
-        for (j = 0; j <= n - 1; j++)
+        // VV: CONTRACTED_I is better than WORST
+        //     Replace WORST with CONTRACTED_I
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
         {
             v[vg][j] = vc[j];
         }
@@ -1025,87 +1025,159 @@ optstepresult NelderMead::do_step_contract(const double objectives[],
     }
     else
     {
-        // VV: Replace SECOND BEST
-        double new_vh[NMD_NUM_KNOBS];
-        
-        auto gen_new = [this, &new_vh](double *extra) mutable -> double* {
-            for (auto j = 0; j < NMD_NUM_KNOBS; j++)
-                new_vh[j] = v[vs][j] + DELTA * (v[vh][j] - v[vs][j]) - extra[j];
-                
-            my_constraints(new_vh);
-
-            return new_vh;
-        };
-
-        generate_new(gen_new);
-
-        for (j = 0; j < NMD_NUM_KNOBS; j++)
-            v[vh][j] = new_vh[j];
-
-        // VV: Now evaluate SHRINK
-
-        optstepresult res;
-        res.threads = v[vh][0];
-        res.freq_idx = v[vh][1];
         state_ = shrink;
-
-        auto key = std::make_pair(res.threads, res.freq_idx);
-
-        auto entry = cache_.find(key);
-
-        if (entry != cache_.end())
-        {
-            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
-            auto dt = timestamp_now - entry->second._cache_timestamp;
-
-            if (dt < entry->second._cache_expires_dt)
-            {
-                return do_step_shrink(entry->second.objectives, 
-                                        entry->second.threads, 
-                                        entry->second.freq_idx);
-            }
-        }
-
-        return res;
+        return do_step_shrink();
     }
 }
 
-optstepresult NelderMead::do_step_shrink(const double objectives[], 
-            double knob1, double knob2)
+optstepresult NelderMead::do_step_contract_out(const double objectives[],
+    double knob1, double knob2)
 {
+    int j;
 #ifdef NMD_DEBUG_
-    std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl;
+    std::cout << "[NelderMead|DEBUG] State = ContractionOUT" << std::endl;
 #endif
-    f[vh] = evaluate_score(objectives, nullptr);
+    fc = evaluate_score(objectives, nullptr);
 
     double profiled[] = {knob1, knob2};
     my_constraints(profiled);
 
-    if ( v[vh][0] != profiled[0] || v[vh][1] != profiled[1] ) {
-        std::cout << "[NelderMead|WARN] Meant to profile shrink " << v[vh][0] << " knob1 "
+    if ( vc[0] != profiled[0] || vc[1] != profiled[1] ) {
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[0] << " knob1 "
                      "but ended up using " << profiled[0] << std::endl;
-        std::cout << "[NelderMead|WARN] Meant to profile shrink " << v[vh][1] << " knob2 "
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[1] << " knob2 "
                      "but ended up using " << profiled[1] << std::endl;
         
-        auto key = std::make_pair((int)v[vh][0], (int)v[vh][1]);
+        auto key = std::make_pair((int)vc[0], (int)vc[1]);
         auto iter = cache_.find(key);
         if ( iter != cache_.end() ) {
             iter->second.threads = profiled[0];
             iter->second.freq_idx = profiled[1];
         }
 
-        v[vh][0] = profiled[0];
-        v[vh][1] = profiled[1];
+        vc[0] = profiled[0];
+        vc[1] = profiled[1];
 
-        cache_update((int)v[vh][0], (int)v[vh][1], objectives, true);
+        cache_update((int)vc[0], (int)vc[1], objectives, true);
     }
 
-    const int threads = (int)(v[vh][0]);
-    const int freq_idx = (int)(v[vh][1]);
+    if (fc <= fr)
+    {
+        // VV: CONTRACTED_O is better than REFLECTED
+        //     Replace WORST with CONTRACTED_O
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
+        {
+            v[vg][j] = vc[j];
+        }
+        f[vg] = fc;
 
-    cache_update(threads, freq_idx, objectives, true);
+        const int threads = (int)(v[vg][0]);
+        const int freq_idx = (int)(v[vg][1]);
 
-    return do_step_start();
+        cache_update(threads, freq_idx, objectives, true);
+        return do_step_start();
+    }
+    else
+    {
+        state_ = shrink;
+        return do_step_shrink();
+    }
+}
+
+optstepresult NelderMead::do_step_shrink()
+{
+#ifdef NMD_DEBUG_
+    std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl;
+#endif
+    for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i) {
+        auto gen_new = [this, i](double *extra) mutable -> double* {      
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                vr[j] = vm[j] + DELTA * (v[i][j] - vm[j]) - extra[j];
+        
+            my_constraints(vr);
+
+            return vr;
+        };
+        
+        generate_new(gen_new);
+    }
+
+    state_ = warmup;
+    warming_up_step = 0;
+    return do_step_warmup({}, 0, 0);
+}
+
+optstepresult NelderMead::do_step_warmup(const double objectives[],
+            double knob1, double knob2)
+{
+    #ifdef NMD_DEBUG_
+        std::cout << "[NelderMead|DEBUG] State = Warmup " 
+                    << warming_up_step << std::endl;
+    #endif
+
+    OUT_DEBUG(
+        if ( warming_up_step == 0 ) {
+            std::cout << "[NelderMead|DEBUG] Initial exploration" << std::endl;
+
+            for ( auto i =0; i<NMD_NUM_KNOBS+1; ++i ) {
+                std::cout << "Simplex[" << i <<"]:";
+                for ( auto j=0; j<NMD_NUM_KNOBS; ++j )
+                    std::cout << " " << initial_configurations[i][j];
+                std::cout << std::endl;
+            }
+        }
+    )
+
+    // VV: Make sure that we actually profiled what we meant to
+    if ( warming_up_step > 0 && warming_up_step <= NMD_NUM_KNOBS + 1) {
+        double profiled[] = {knob1, knob2};
+        my_constraints(profiled);
+
+        if ( v[warming_up_step-1][0] != profiled[0] || v[warming_up_step-1][1] != profiled[1] ) {
+            std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][0] << " knob1 "
+                        "but ended up using " << profiled[0] << std::endl;
+            std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][1] << " knob2 "
+                        "but ended up using " << profiled[1] << std::endl;
+            
+            auto key = std::make_pair((int)v[warming_up_step-1][0], (int)v[warming_up_step-1][1]);
+            auto iter = cache_.find(key);
+            if ( iter != cache_.end() ) {
+                iter->second.threads = profiled[0];
+                iter->second.freq_idx = profiled[1];
+            }
+
+            v[warming_up_step-1][0] = profiled[0];
+            v[warming_up_step-1][1] = profiled[1];
+        }
+        
+        // VV: Record results of last warming up step
+        f[warming_up_step-1] = evaluate_score(objectives, nullptr);
+        cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], 
+                        objectives, true);
+    } 
+
+    if ( warming_up_step == NMD_NUM_KNOBS + 1) {
+        // VV: We need not explore the knob_set space anymore
+        state_ = start;
+        return step(objectives, knob1, knob2);
+    } else if (warming_up_step > NMD_NUM_KNOBS + 1) {
+        std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl;
+    }
+    optstepresult res;
+    
+    res.objectives[0] = -1;
+    res.objectives[1] = -1;
+    res.objectives[2] = -1;
+    res.converged = false;
+
+    res.threads = initial_configurations[warming_up_step][0];
+    res.freq_idx = initial_configurations[warming_up_step][1];
+    
+    v[warming_up_step][0] = res.threads;
+    v[warming_up_step][1] = res.freq_idx;
+    warming_up_step++;
+
+    return res;
 }
 
 optstepresult NelderMead::step(const double objectives[], 
@@ -1161,72 +1233,7 @@ optstepresult NelderMead::step(const double objectives[],
     {
     case warmup:
     {
-        #ifdef NMD_DEBUG_
-            std::cout << "[NelderMead|DEBUG] State = Warmup " 
-                      << warming_up_step << std::endl;
-        #endif
-
-        OUT_DEBUG(
-            if ( warming_up_step == 0 ) {
-                std::cout << "[NelderMead|DEBUG] Initial exploration" << std::endl;
-
-                for ( auto i =0; i<NMD_NUM_KNOBS+1; ++i ) {
-                    std::cout << "Simplex[" << i <<"]:";
-                    for ( auto j=0; j<NMD_NUM_KNOBS; ++j )
-                        std::cout << " " << initial_configurations[i][j];
-                    std::cout << std::endl;
-                }
-            }
-        )
-
-        // VV: Make sure that we actually profiled what we meant to
-        if ( warming_up_step > 0 && warming_up_step <= NMD_NUM_KNOBS + 1) {
-            double profiled[] = {knob1, knob2};
-            my_constraints(profiled);
-
-            if ( v[warming_up_step-1][0] != profiled[0] || v[warming_up_step-1][1] != profiled[1] ) {
-                std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][0] << " knob1 "
-                            "but ended up using " << profiled[0] << std::endl;
-                std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][1] << " knob2 "
-                            "but ended up using " << profiled[1] << std::endl;
-                
-                auto key = std::make_pair((int)v[warming_up_step-1][0], (int)v[warming_up_step-1][1]);
-                auto iter = cache_.find(key);
-                if ( iter != cache_.end() ) {
-                    iter->second.threads = profiled[0];
-                    iter->second.freq_idx = profiled[1];
-                }
-
-                v[warming_up_step-1][0] = profiled[0];
-                v[warming_up_step-1][1] = profiled[1];
-            }
-            
-            // VV: Record results of last warming up step
-            f[warming_up_step-1] = evaluate_score(objectives, nullptr);
-            cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], 
-                         objectives, true);
-        } 
-
-        if ( warming_up_step == NMD_NUM_KNOBS + 1) {
-            // VV: We need not explore the knob_set space anymore
-            state_ = start;
-            return step(objectives, knob1, knob2);
-        } else if (warming_up_step > NMD_NUM_KNOBS + 1) {
-            std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl;
-        }
-
-        res.objectives[0] = -1;
-        res.objectives[1] = -1;
-        res.objectives[2] = -1;
-        res.converged = false;
-
-        res.threads = initial_configurations[warming_up_step][0];
-        res.freq_idx = initial_configurations[warming_up_step][1];
-        
-        v[warming_up_step][0] = res.threads;
-        v[warming_up_step][1] = res.freq_idx;
-        warming_up_step++;
-
+        res = do_step_warmup(objectives, knob1, knob2);
         break;
     }
     break;
@@ -1240,11 +1247,11 @@ optstepresult NelderMead::step(const double objectives[],
     case expansion:
         res = do_step_expand(objectives, knob1, knob2);
         break;
-    case contraction:
-        res = do_step_contract(objectives, knob1, knob2);
+    case contraction_in:
+        res = do_step_contract_in(objectives, knob1, knob2);
         break;
-    case shrink:
-        res = do_step_shrink(objectives, knob1, knob2);
+    case contraction_out:
+        res = do_step_contract_out(objectives, knob1, knob2);
         break;
     default:
         std::cout << "Unknown NelderMead state (" << state_ << ")" << std::endl;
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index 52a6890..8de511f 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -24,7 +24,7 @@
 
 
 // VV: Define this to use time/energy/resources instead of speed/energy/efficiency
-#define ALTERNATIVE_SCORE 
+// #define ALTERNATIVE_SCORE 
 
 namespace allscale { namespace dashboard
 {
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 4b29f05..389aa5e 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -198,7 +198,7 @@ float estimate_power(float frequency)
 
 global_optimizer::global_optimizer()
     : u_balance_every(10), u_steps_till_rebalance(u_balance_every),
-    active_nodes_(allscale::get_num_localities(), true), tuner_(new simple_coordinate_descent(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)})),
+    active_nodes_(allscale::get_num_localities(), true),
     objective_(get_default_objective()),
     active_(true), localities_(hpx::find_all_localities()),
     f_resource_max(-1.0f), f_resource_leeway(-1.0f), 
@@ -208,6 +208,8 @@ global_optimizer::global_optimizer()
     last_optimization_score(1.0)
 {
     char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
+    char *const c_tuner = std::getenv("ALLSCALE_TUNER");
+
     std::string input_objective_str =
       hpx::get_config_entry("allscale.objective", "");
     
@@ -237,12 +239,13 @@ global_optimizer::global_optimizer()
             f_resource_max = atof(c_resource_max);
         
         nodes_min = f_resource_leeway * localities_.size();
-        nodes_max = localities_.size();
-
-        if ( nodes_min < 1 )
-            nodes_min = 1;
     }
 
+    nodes_max = localities_.size();
+
+    if ( nodes_min < 1 )
+        nodes_min = 1;
+
     if ( c_policy && strcasecmp(c_policy, "ino"))
         o_ino = allscale::components::internode_optimizer_t(localities_.size(),
                                                             (double) f_resource_max,
@@ -264,6 +267,15 @@ global_optimizer::global_optimizer()
     objectives_scale[0] = 0.5;
     objectives_scale[1] = 1.0;
     objectives_scale[2] = 1.0;
+
+    if (c_policy && strcasecmp(c_policy, "neldermead")) {
+        std::cout << "Choosing NelderMead Optimizer for global optimization" << std::endl;
+        tuner_ = std::make_unique<nmd_optimizer>(nodes_min, nodes_max);
+    }
+    else {
+        std::cout << "Choosing Coordinate Descent Optimizer for global optimization" << std::endl;
+        tuner_ = std::make_unique<simple_coordinate_descent>(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)});
+    }
 }
 
 double global_optimizer::get_optimization_score()
diff --git a/src/tuner.cpp b/src/tuner.cpp
index 546a2be..f3d421e 100644
--- a/src/tuner.cpp
+++ b/src/tuner.cpp
@@ -4,6 +4,8 @@
 #include <allscale/components/monitor.hpp>
 #include <allscale/utils/printer/vectors.h>
 #include <allscale/utils/optional.h>
+#include <algorithm>
+
 
 namespace allscale {
     std::ostream& operator<<(std::ostream& os, tuner_configuration const& cfg)
@@ -204,4 +206,105 @@ namespace allscale {
         // print a status message
         std::cerr << "New search direction: " << (dim == num_nodes ? "#nodes" : "frequency") << " " << (dir == up ? "up" : "down") << "\n";
     }
+
+    nmd_optimizer::nmd_optimizer(std::size_t nodes_min, 
+                                 std::size_t nodes_max)
+    : nmd(2, 3, 0.01, 2000, 50ul)
+    , converged(false)
+    {
+        constraint_min[0] = nodes_min;
+        constraint_max[0] = nodes_max;
+
+        avail_freqs = monitor::get().get_available_freqs(0);
+        std::sort(avail_freqs.begin(), avail_freqs.end());
+
+        if ( avail_freqs.size() ) {
+            constraint_min[1] = 0;
+            constraint_max[1] = avail_freqs.size() - 1;
+        } else {
+            constraint_min[1] = 0;
+            constraint_max[1] = 0;
+        }
+
+        previous_weights[0] = 0;
+        previous_weights[1] = 0;
+        previous_weights[2] = 0;
+    }
+
+    tuner_configuration nmd_optimizer::next(tuner_configuration const& current_cfg, tuner_state const& current_state, tuning_objective obj)
+    {
+        tuner_configuration res;
+        auto action = std::vector<std::size_t>();
+        std::cout << "Initializing NMD" << std::endl;
+
+        const double weights[] = {
+                obj.speed_exponent, obj.efficiency_exponent, obj.power_exponent
+        };
+
+        double diff = 0.0;
+
+        for (auto i=0ul; i<3; ++i)
+            diff += abs(previous_weights[i] - weights[i]);
+
+        if ( diff > 0.01 ) {
+            // VV: Enforce exploration
+            initialized = false;
+            this->converged = false;
+        }
+
+        for (auto i=0ul; i<3; ++i)
+            previous_weights[i] = weights[i];
+
+        if ( initialized == false ){
+            nmd.initialize(constraint_min, 
+                            constraint_max, 
+                            nullptr, 
+                            weights,
+                            &nmd.score_speed_efficiency_power);
+            initialized = true;
+        }
+        
+        if ( this->converged == false ) {
+            double measurements[3] = {current_state.speed, current_state.efficiency, current_state.power};
+
+            std::size_t num_active_nodes = std::count(current_cfg.node_mask.begin(),
+                        current_cfg.node_mask.end(), 
+                        true);
+            std::size_t freq_idx;
+            auto e = std::find(avail_freqs.begin(), avail_freqs.end(), current_cfg.frequency);
+
+            if ( e == avail_freqs.end() )
+                freq_idx = 0;
+            else
+                freq_idx = e - avail_freqs.begin();
+
+            const std::size_t observed[] = {num_active_nodes, freq_idx};
+            auto ret = nmd.get_next(measurements, observed);
+            action.assign(ret.first.begin(), ret.first.end());
+            auto converged = ret.second;
+
+            if (converged) {
+                best.assign(action.begin(), action.end());
+                this->converged = true;
+            }
+        } else {
+            action.assign(best.begin(), best.end());
+        }
+
+        res.node_mask.assign(current_cfg.node_mask.begin(), 
+                            current_cfg.node_mask.end());
+        
+        for (auto i=0ul; i<action[0]; ++i)
+                res.node_mask[i] = true;
+        for (auto i=action[0]; i<res.node_mask.size(); ++i)
+            res.node_mask[i] = false;
+        
+        res.frequency = action[1];
+
+        if ( avail_freqs.size() ) {
+            res.frequency = avail_freqs[action[1]];
+        }
+
+        return res;
+    }
 }

From 42ee93eeb39e3977426510e9609f195b4a90a2a2 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Mon, 26 Nov 2018 17:02:31 +0000
Subject: [PATCH 33/37] Disabled verbose logging for NMD implementations

---
 src/components/nmd.cpp              | 4 ++--
 src/components/nmsimplex_bbincr.cpp | 3 +--
 src/tuner.cpp                       | 1 -
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/components/nmd.cpp b/src/components/nmd.cpp
index 5893b5b..bb59b1a 100644
--- a/src/components/nmd.cpp
+++ b/src/components/nmd.cpp
@@ -7,8 +7,8 @@
 #include <allscale/components/nmd.hpp>
 
 
-#define NMD_DEBUG_
-#define NMD_INFO_
+//#define NMD_DEBUG_
+//#define NMD_INFO_
 
 #ifdef NMD_DEBUG_
 #define OUT_DEBUG(X) X
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 430afac..82ae4e9 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -14,8 +14,7 @@
 #include <allscale/components/nmsimplex_bbincr.hpp>
 #include <cmath>
 
-#define NMD_DEBUG_ 1
-#define NMD_INFO_ 1
+//#define NMD_DEBUG_ 1
 
 #ifdef NMD_DEBUG_
 #define OUT_DEBUG(X) X
diff --git a/src/tuner.cpp b/src/tuner.cpp
index f3d421e..2203687 100644
--- a/src/tuner.cpp
+++ b/src/tuner.cpp
@@ -235,7 +235,6 @@ namespace allscale {
     {
         tuner_configuration res;
         auto action = std::vector<std::size_t>();
-        std::cout << "Initializing NMD" << std::endl;
 
         const double weights[] = {
                 obj.speed_exponent, obj.efficiency_exponent, obj.power_exponent

From 65f6a771163ae0706d985db80d2003e3971ae907 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 27 Nov 2018 09:28:51 +0000
Subject: [PATCH 34/37] Disabled more debug printouts

(re-enable #defines to get the functionality back)
---
 allscale/components/localoptimizer.hpp |  6 +++---
 src/components/localoptimizer.cpp      | 11 ++++++-----
 src/components/scheduler_component.cpp |  4 ++--
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index 96d1f5f..9e11ebb 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -15,9 +15,9 @@
 #include <string>
 
 //#define MEASURE_MANUAL_ 1
-#define MEASURE_ 1
-#define DEBUG_ 1
-#define DEBUG_MULTIOBJECTIVE_ 1
+// #define MEASURE_ 1
+// #define DEBUG_ 1
+// #define DEBUG_MULTIOBJECTIVE_ 1
 
 namespace allscale
 {
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 52989d8..da51330 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -10,12 +10,12 @@
 #include <stdlib.h>
 #include <stdexcept>
 
-#define DEBUG_ 1
+//#define DEBUG_ 1
 //#define DEBUG_INIT_ 1 // define to generate output during scheduler initialization
-#define DEBUG_MULTIOBJECTIVE_ 1
-#define DEBUG_CONVERGENCE_ 1
+//#define DEBUG_MULTIOBJECTIVE_ 1
+//#define DEBUG_CONVERGENCE_ 1
 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test
-#define MEASURE_ 1
+//#define MEASURE_ 1
 // only meant to be defined if one needs to measure the efficacy
 // of the scheduler
 //#define ALLSCALE_HAVE_CPUFREQ 1
@@ -111,7 +111,7 @@ bool localoptimizer::isConverged()
 	#endif 
 	return converged_; 
 }
-
+#ifdef DEBUG_
 void localoptimizer::printverbosesteps(actuation act)
 {
 	static int last_frequency_idx = 0;
@@ -130,6 +130,7 @@ void localoptimizer::printverbosesteps(actuation act)
 	std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx]
 			  << std::endl;
 }
+#endif
 
 void localoptimizer::accumulate_objective_measurements()
 {
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 2b865bf..3caa522 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -22,12 +22,12 @@
 
 //#define DEBUG_ 1
 //#define DEBUG_INIT_ 1 // define to generate output during scheduler initialization
-#define DEBUG_MULTIOBJECTIVE_ 1
+// #define DEBUG_MULTIOBJECTIVE_ 1
 //#define DEBUG_THREADTHROTTLING_ 1
 //#define DEBUG_THREADSTATUS_ 1
 //#define DEBUG_FREQSCALING_ 1
 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test
-#define MEASURE_ 1
+// #define MEASURE_ 1
 // only meant to be defined if one needs to measure the efficacy
 // of the scheduler
 #undef DEBUG_

From 36a95e282121b87485403f3ff60878b4699d7109 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 8 Jan 2019 11:12:23 +0000
Subject: [PATCH 35/37] Enable elasticity when *any* kind of objective is
 selected

---
 src/scheduler.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index f1be5bc..688b682 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -75,7 +75,7 @@ namespace allscale
 
         std::string input_objective_str = hpx::get_config_entry("allscale.objective", "none");
 //         std::cerr << "  Scheduler objective is " << input_objective_str << "\n";
-        bool enable_elasticity = false;
+        bool enable_elasticity = true;
         if ( !input_objective_str.empty() )
         {
             std::istringstream iss_leeways(input_objective_str);
@@ -92,17 +92,6 @@ namespace allscale
                     obj = objective_str.substr(0, idx);
                     leeway = std::stod( objective_str.substr(idx + 1) );
                 }
-
-                if (obj == "time")
-                {
-                    enable_elasticity = true;
-                    break;
-                }
-                else if (obj == "resource")
-                {
-                    enable_elasticity = true;
-                    break;
-                }
             }
         }
 

From 0d518ed267ac63caa56076df10fb66da36a3c5bf Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 8 Jan 2019 11:14:22 +0000
Subject: [PATCH 36/37] Enable elasticity when *any* kind of objective is
 selected

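The previous commit was incomplete: it initialised enable_elasticity to
true, which turned elasticity on even when no objective string is set.
Default to false again and set it to true only inside the branch that
handles a non-empty "allscale.objective" string.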
---
 src/scheduler.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 688b682..05a7479 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -75,7 +75,7 @@ namespace allscale
 
         std::string input_objective_str = hpx::get_config_entry("allscale.objective", "none");
 //         std::cerr << "  Scheduler objective is " << input_objective_str << "\n";
-        bool enable_elasticity = true;
+        bool enable_elasticity = false;
         if ( !input_objective_str.empty() )
         {
             std::istringstream iss_leeways(input_objective_str);
@@ -93,6 +93,8 @@ namespace allscale
                     leeway = std::stod( objective_str.substr(idx + 1) );
                 }
             }
+            
+            enable_elasticity = true;
         }
 
         rp.set_default_pool_name("allscale-numa-0");

From 9264d07a897b684baedf3f7bf02a9b8e25951d65 Mon Sep 17 00:00:00 2001
From: Vassilis Vassiliadis <Vassilis.Vassiliadis@ibm.com>
Date: Tue, 8 Jan 2019 14:21:59 +0000
Subject: [PATCH 37/37] Fixed segmentation fault in power upkeeping

---
 src/components/scheduler_component.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 3caa522..1a5ae8e 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -675,14 +675,16 @@ void scheduler::optimize_locally(work_item const& work)
                         active_threads};
                 lopt_.measureObjective(current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()),
                         active_threads);
-                last_power_usage=0;
-                power_sum=0;
 
                 last_objective_score = lopt_.evaluate_score(last_objectives);
 
                 auto power_dt = t_duration_now - last_measure_power;
                 update_power_consumption(power_sum/last_power_usage, power_dt);
                 last_measure_power = t_duration_now;
+
+                // VV: instead of starting from scratch, remember the last power measurement
+                last_power_usage=1;
+                power_sum=current_power_usage;
             }
 
             elapsedTimeMs = t_duration_now - last_optimization_timestamp_;