diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp
index c0e588a..9e11ebb 100644
--- a/allscale/components/localoptimizer.hpp
+++ b/allscale/components/localoptimizer.hpp
@@ -3,6 +3,7 @@
 #define ALLSCALE_COMPONENTS_LOCALOPTIMIZER_HPP
 
 #include <allscale/components/nmsimplex_bbincr.hpp>
+
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 #include <allscale/util/hardware_reconf.hpp>
 #endif
@@ -14,223 +15,214 @@
 #include <string>
 
 //#define MEASURE_MANUAL_ 1
-#define MEASURE_ 1
-//#define DEBUG_ 1
-
-namespace allscale { namespace components {
-
-    enum objectiveType {time,energy,resource};
-
-    enum parameterType {thread, frequency};
-
-    enum searchPolicy {allscale, random, manual};
-
-    /* structure type of a single optimization objective */
-    struct objective{
-      objectiveType type;
-      /* leeway threshold desired, 0-1 double */
-      double leeway;
-      /* non-negative integer priority of the objective, 0 is highest priority*/
-      int priority;
-      /* local minimum during single objective optimization */
-      double localmin;
-      /* local maximum during single objective optimization */
-      double localmax;
-      /* local minimum during single objective optimization */
-      double globalmin;
-      /* local minimum during single objective optimization */
-      double globalmax;
-      /* current deviation of the objective value from observed min */
-      double currentthreshold;
-      /* sampled objective values throughout execution */
-      std::vector<double> samples;
-      /* thread number that lead to the objective value in samples vector */
-      std::vector<double> threads_samples;
-      /* frequency index that lead to the objective value in samples vector */
-      std::vector<double> freq_samples;
-      /* true if optimization of objective has converged, false otherwise */
-      bool converged;
-      /* true if optimizer for objective has been initialized, false otherwise */
-      bool initialized;
-      /* index to the parameter vectors for setup that has so far achieved
-         the minimum over all samples */
-      long int min_params_idx;
-      double converged_minimum;
-      double minimization_params[2];
-    };
-
-
-    /* structure type modelling an optimization actuation action to be taken
+// #define MEASURE_ 1
+// #define DEBUG_ 1
+// #define DEBUG_MULTIOBJECTIVE_ 1
+
+namespace allscale
+{
+namespace components
+{
+
+enum objectiveType
+{
+	time,
+	energy,
+	resource
+};
+
+enum searchPolicy // search strategy selector, see localoptimizer::setPolicy / getPolicy
+{
+	allscale,
+	random,
+	manual,
+	none // in-class default of localoptimizer::optmethod_
+};
+
+
+/* structure type modelling an optimization actuation action to be taken
        by the scheduler */
-    struct actuation{
-       /* number of threads to resume (>0) or suspend (<0). If set to zero,
-          number of threads will stay unchanged. */
-       unsigned int delta_threads;
-
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-        /* index to the global cpu-supported frequencies vector pointing to
-           the new frequency to be set. If set to -1, frequency will stay
-           unchanged */
-       unsigned int frequency_idx;
-#endif
-    };
-
-    struct localoptimizer
-    {
-        localoptimizer()
-            :nmd(0.01),
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-            frequency_param_(0),
-#endif
-            current_objective_idx_(0),converged_(false)
-    {
-            if (optmethod_==random)
-                srand (std::time(NULL));
-            }
-
-        localoptimizer(std::list<objective>);
-
-        bool isConverged(){return converged_;}
-
-        void setPolicy(searchPolicy pol){
-          optmethod_ = pol;
+struct actuation
+{
+	unsigned int threads; // NOTE(review): the removed revision stored a delta (delta_threads); this looks like an absolute thread count now -- confirm
+	int frequency_idx; // NOTE(review): presumably an index into the allowed-frequencies vector; the removed revision documented -1 as "leave unchanged" -- confirm it still holds
+};
+
+struct localoptimizer
+{
+	localoptimizer();
+	bool isConverged();
+	double evaluate_score(const double objectives[]);
+	void setPolicy(searchPolicy pol)
+	{
+		optmethod_ = pol;
 #ifdef DEBUG_
-          std::cout << "Local Optimizer Initialized with "
-                    << policyToString(pol)
-                    << " policy for single objective search."
-                    << std::endl;
+		std::cout << "Local Optimizer Initialized with "
+				  << policyToString(pol)
+				  << " policy for multi-objective search."
+				  << std::endl;
 #endif
-        }
-
-        searchPolicy getPolicy(){return optmethod_;}
-
-        void setobjectives(std::list<objective>);
-
-        std::size_t getCurrentThreads(){return threads_param_;}
+	}
+	void initialize_nmd(bool from_scratch);
+	searchPolicy getPolicy() { return optmethod_; }
+	
+	// VV: Modifying the objectives triggers restarting the optimizer
+	void setobjectives(double time_weight, 
+						double energy_weight, 
+						double resource_weight);
+
+	void getobjectives(double *time_weight, 
+					   double *energy_weight,
+					   double *resource_weight)
+	{
+		if ( time_weight != nullptr )
+			*time_weight = this->time_weight;
+		if ( energy_weight != nullptr )
+			*energy_weight = this->energy_weight;
+		if ( resource_weight != nullptr )
+			*resource_weight = this->resource_weight;
+	}
+
+	void set_objectives_scale(const double objectives_scale[3]);
+
+	std::size_t getCurrentThreads() { return threads_param_; }
+
+	void setCurrentThreads(std::size_t threads) { threads_param_ = threads; }
+
+	unsigned int getCurrentFrequencyIdx()
+	{
+		return frequency_param_;
+	}
+
+	void setCurrentFrequencyIdx(unsigned int idx) { frequency_param_ = idx; }
+
+	const std::vector<unsigned long>
+	setfrequencies(std::vector<unsigned long> frequencies)
+	{
+		#if 0
+		const std::size_t max_freqs = 10;
+		std::size_t keep_every = (std::size_t) ceilf(frequencies.size() / (float) max_freqs);
+
+		if ( keep_every > 1 ) {
+			std::vector<unsigned long> new_freqs;
+
+			int i, j, len;
+
+			for (j=0, i=0, len=frequencies.size(); i<len; ++i ) {
+				if ( (i==len-1) || ( (i % keep_every) == 0 )) {
+				new_freqs.push_back(frequencies[i]);
+				}
+			}      
+
+			frequencies = new_freqs;
+		}
+		#endif
+
+		frequencies_param_allowed_ = frequencies;
+		//std::cout << "**************** = " << frequency_param_ << std::endl;
+		//for(auto& el: frequencies_param_allowed_)
+		//  std::cout << "***>>>> " << el << std::endl;
+		return frequencies_param_allowed_;
+	}
+
+	std::size_t getmaxthreads()
+	{
+		return max_threads_;
+	}
+
+	void setmaxthreads(std::size_t threads);
+
+	/* executes one step of multi-objective optimization */
+	actuation step(std::size_t active_threads);
+
+	/* records one measurement sample (iter_time, power, threads) */
+	void measureObjective(double iter_time, double power, double threads);
+
+	/* restarts multi-objective optimization from current best solution */
+	void reset(int, int);
 
-        void setCurrentThreads(std::size_t threads){threads_param_ = threads;}
-
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-        unsigned int getCurrentFrequencyIdx(){return frequency_param_;}
-
-        void setCurrentFrequencyIdx(unsigned int idx){frequency_param_ = idx;}
-
-        const std::vector<unsigned long>
-        setfrequencies(std::vector<unsigned long> frequencies){
-            frequencies_param_allowed_=frequencies;
-            //std::cout << "**************** = " << frequency_param_ << std::endl;
-            //for(auto& el: frequencies_param_allowed_)
-            //  std::cout << "***>>>> " << el << std::endl;
-            return frequencies_param_allowed_;
-        }
+#ifdef DEBUG_
+	void printobjectives();
+	void printverbosesteps(actuation);
 #endif
 
-        void setmaxthreads(std::size_t threads){
-            max_threads_=threads;
-            threads_param_=threads;
-        }
+	/* Returns the printable name of `pol`. Every searchPolicy enumerator, including
+	   `none`, has its own case; a value outside the enumeration yields "". */
+	std::string policyToString(searchPolicy pol)
+	{
+		switch (pol)
+		{
+		case random:
+			return "random";
+		case allscale:
+			return "allscale";
+		case manual:
+			return "manual";
+		case none:
+			return "none";
+		}
+		return std::string(); // not a named enumerator: keep the empty-string result
+	}
 
-        /* executes one step of multi-objective optimization */
-        actuation step();
+  private:
+	double time_weight, energy_weight, resource_weight;
 
-        /* adds a measurement sample to the specified objective */
-        void measureObjective(double iter_time, double power, double threads);
+	// VV: Used to convert thread_idx to actual number of threads
+	std::size_t threads_dt;
 
-        /* restarts multi-objective optimization from current best solution */
-        void reset(int,int);
+	void accumulate_objective_measurements();
+	void reset_accumulated_measurements();
 
-#ifdef DEBUG_
-        void printobjectives();
-        void printverbosesteps(actuation);
-#endif
-
-        std::string policyToString(searchPolicy pol){
-          std::string str;
-          switch (pol){
-            case random:
-              str = "random";
-              break;
-            case allscale:
-              str = "allscale";
-              break;
-            case manual:
-              str = "manual";
-              break;
-          }
-          return str;
-        }
+	std::vector<double> samples_energy;
+	std::vector<double> samples_time;
+	std::vector<double> samples_threads;
+	std::vector<double> samples_freq;
 
-    private:
+	bool explore_knob_domain;
 
-        /* vector of active optimization objectives. Objectives are stored
-           in the vector in decreasing priority order */
-        std::vector<objective> objectives_;
+	double pending_time, pending_energy, pending_threads;
+	unsigned long pending_num_times;
 
-        NelderMead nmd;
+	bool mo_initialized;
 
-        /* counts number of parameter changes (as pair) */
-        unsigned long long int param_changes_;
+	NelderMead nmd;
 
-        /* single objective optimization method used */
-        searchPolicy optmethod_ = random;
+	/* search policy (optimization method) currently selected */
+	searchPolicy optmethod_ = none;
 
-        /* active optimization parameter - nr of OS threads active */
-        int threads_param_;
+	/* active optimization parameter - nr of OS threads active */
+	int threads_param_;
 
-        /* ordered set of OS thread values that have been assigned to the
+	/* ordered set of OS thread values that have been assigned to the
            runtime by the optimization algorithm. The most recent value is
            stored at the end of the vector */
-        std::vector<unsigned long> thread_param_values_;
+	std::vector<unsigned long> thread_param_values_;
 
-        /* maximum number of OS threads supported by the runtime */
-        std::size_t max_threads_;
+	/* maximum number of OS threads supported by the runtime */
+	std::size_t max_threads_;
 
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-        /* active optimization parameter - current CPU frequency index */
-        unsigned int frequency_param_;
+	/* active optimization parameter - current CPU frequency index */
+	unsigned int frequency_param_;
 
-        /* ordered set of frequency values that the CPU has been set to by
-           the optimization algorithm. The most recent value is stored at the
-           end of the vector */
-        std::vector<unsigned long> frequency_param_values_;
-
-        /* vector containing sorted list of frequencies supported by the
+	/* vector containing sorted list of frequencies supported by the
            processor */
-        std::vector<unsigned long> frequencies_param_allowed_;
-
-        /* index to the vector of allowed frequencies that points to the highest
-           frequency. The ordering of the vector, as reported by hardware
-           reconfiguration can be platform specific, and therefore we need this
-           index to make sorted access to the vector platform agnostic */
-        const short unsigned int highest_frequency_allowed_idx_ = 0;
-#endif
-
-        /* threshold (percentage in [0,1]) to decide convergence of optimization
-           steps against a single objective */
-        const double convergence_threshold_ = 0.02;
-
-        /***** optimization state variables ******/
-
-        /* index to the _objectives vector of currently optimized objective */
-        unsigned short int current_objective_idx_;
+	std::vector<unsigned long> frequencies_param_allowed_;
 
-        /* number of times the optimizer step() has been invoked, this is for
-           init and housekeeping purposes */
-         unsigned long long int steps_;
+	/* threshold (percentage in [0,1]) to decide convergence of optimization
+           steps */
+	double convergence_threshold_;
 
-        /* currently optimized parameter */
-        parameterType current_param_;
+	/***** optimization state variables ******/
 
-        /* initial warm-up steps */
-        const unsigned int warmup_steps_=3;
+	/* initial warm-up steps */
+	const unsigned int warmup_steps_ = 3;
 
-        /* maximum number of optimization steps allowed */
-        const int max_steps_=100;
+	/* set to true if local optimizer has converged over all objectives */
+	bool converged_;
 
-        /* set to true if local optimizer has converged over all objectives */
-        bool converged_;
-    };
-}
-}
+	double objectives_scale[3];
+};
+} // namespace components
+} // namespace allscale
 
 #endif
diff --git a/allscale/components/nmd.hpp b/allscale/components/nmd.hpp
new file mode 100644
index 0000000..7f462ff
--- /dev/null
+++ b/allscale/components/nmd.hpp
@@ -0,0 +1,162 @@
+/*
+Nelder Mead implementation for arbitrary number of knobs and number of objectives.
+
+Developed explicitly for non-continuous search spaces.
+
+Important information
+---------------------
+
+This implementation uses a cache coupled with the exploration-heuristic that is explained
+below to refrain from evaluating the same set of knobs multiple times.
+
+If NMD proposes to explore a knob-set that has been recently evaluated (i.e. there's a
+non stale entry in the cache) the heuristic will instead propose the closest point that is
+enclosed within the N-dimensional (where N = num_knobs) space near the knob set that NMD
+initially proposed. The N-dimensional space takes the form of a square, cube, or hypercube for
+N = 2, 3, 4 respectively. Each edge may be at most @max_distance_long (see generate_unique for more info).
+
+author: vasiliadis.vasilis@gmail.com
+*/
+#ifndef ALLSCALE_NMD_HEADER
+#define ALLSCALE_NMD_HEADER
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <set>
+#include <vector>
+#include <cmath>
+namespace allscale {
+namespace components {
+
+struct logistics { // value type of NmdGeneric::cache, which is keyed by a knob vector
+    std::vector<double> objectives;
+    std::vector<std::size_t> knobs;
+    // NOTE(review): cache_ts / cache_dt look like an entry timestamp and its validity window (ms?) -- confirm against the implementation
+    int64_t cache_ts, cache_dt;
+    // NOTE(review): presumably records whether the search had converged when this entry was stored -- confirm
+    bool converged;
+};
+
+#define ALPHA 1.0   /* reflection coefficient */
+#define BETA 0.5	/* contraction coefficient */
+#define GAMMA 2.0   /* expansion coefficient */
+#define DELTA 0.5   /* shrinking coefficient */
+
+class NmdGeneric { // Nelder-Mead search over integer knob vectors; most members are only declared in this header
+public:
+    NmdGeneric();
+    NmdGeneric(std::size_t num_knobs, std::size_t num_objectives,
+               double conv_threshold, int64_t cache_expire_dt_ms,
+               std::size_t max_iters);
+    // Score = 1 - m[0]^w[0] * m[1]^w[1] * (1 - m[2])^w[2]; reads exactly three measurements and three weights.
+    static double score_speed_efficiency_power(const double measurements[], const double weights[])
+    {
+        double ret = std::pow(measurements[0], weights[0]) *
+                    std::pow(measurements[1], weights[1]) *
+                    std::pow((1-measurements[2]), weights[2]);
+        // A non-finite product, or one above 1.0, is clamped to 1.0, so the returned score becomes 0.0.
+        if ( std::isfinite(ret) == 0  || ret > 1.0 ) {
+            ret = 1.0;
+        }
+        // NOTE(review): 0.0 is described below as the perfect score -- confirm NaN/inf inputs should map to it.
+        return 1.0 - ret;
+    }
+
+    void initialize(const std::size_t constraint_min[], const std::size_t constraint_max[],
+                    const std::size_t *initial_config[], const double weights[],
+                    double (*score_function)(const double[], const double []));
+
+    void ensure_profile_consistency(std::size_t expected[], const std::size_t observed[]) const;
+
+    void set_constraints_now(const std::size_t constraint_min[], 
+                             const std::size_t constraint_max[]);
+
+    double score(const double measurements[]) const;
+
+    std::pair<std::vector<std::size_t>, bool> get_next(const double measurements[], 
+                            const std::size_t observed_knobs[]);
+
+protected:
+    bool test_convergence();
+
+    // VV: (measurements, weights) returns value in range [0.0, infinite)
+    //     0.0 means perfect score (i.e. the larger the score, the worse it is)
+    double (*score_function)(const double[], const double []);
+
+    std::vector<std::size_t> do_warmup(const double measurements[], 
+                            const std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_reflect(const double measurements[], 
+                            const std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_expand(const double measurements[], 
+                            const std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_contract_in(const double measurements[], 
+                            const std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_contract_out(const double measurements[], 
+                            const std::size_t observed_knobs[]);
+    std::vector<std::size_t> do_shrink();
+    std::vector<std::size_t> do_start(bool consult_cache);
+
+    void sort_simplex(bool consult_cache=true);
+    void compute_centroid();
+
+    void generate_unique(std::size_t initial[], bool accept_stale,
+                        const std::set<std::vector<std::size_t> > *extra) const;
+    std::size_t compute_max_combinations() const;
+    // Clamps knobs[0..num_knobs) in place: values below constraint_min[i] are raised, values above constraint_max[i] are lowered.
+    template<typename T>
+    void apply_constraint(T knobs[]) const
+    {
+        for (auto i=0ul; i<num_knobs; ++i) {
+            if ( knobs[i] < (T) constraint_min[i] )
+                knobs[i] = constraint_min[i];
+            if ( knobs[i] > (T) constraint_max[i] )
+                knobs[i] = constraint_max[i];
+        }
+    }
+    // Increments [begin, end) as a binary counter whose digits are the parities of the elements (last element = lowest digit).
+    //VV: Used to generate all possible combinations of +-
+    // from: https://stackoverflow.com/questions/4633584/
+    template <typename Iter>
+    bool next_binary(Iter begin, Iter end) const
+    {
+        while (begin != end)       // we're not done yet
+        {
+            --end;
+            if ((*end & 1) == 0)   // even number is treated as zero
+            {
+                ++*end;            // increase to one
+                return true;       // still more numbers to come
+            }
+            else                   // odd number is treated as one
+            {
+                --*end;            // decrease to zero and loop
+            }
+        }
+        return false;              // every element was odd: all have been reset to even, the sequence is exhausted
+    }
+
+    enum estate {warmup, start, reflect, expand, contract_in, contract_out, shrink};
+    estate current_state;
+    std::size_t warmup_step;
+
+    double conv_threshold;
+    std::size_t num_knobs;
+    std::size_t num_objectives;
+
+    double *scores; // NOTE(review): raw pointer members from here on; no destructor is declared in this header, so ownership/cleanup is not visible here
+    std::size_t **simplex, **initial_config;
+    std::size_t *constraint_max, *constraint_min;
+    std::size_t *point_reflect, *point_contract, *point_expand, *centroid;
+    std::map< std::vector<std::size_t>, logistics> cache;
+    int64_t cache_expire_dt_ms;
+    double *weights;
+    std::size_t times_reentered_start;
+    double score_reflect, score_contract, score_expand;
+    bool final_explore;
+    std::size_t iteration, max_iters;
+};
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp
index f894d2b..891938d 100644
--- a/allscale/components/nmsimplex_bbincr.hpp
+++ b/allscale/components/nmsimplex_bbincr.hpp
@@ -18,136 +18,244 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include <chrono>
+#include <utility>
+#include <map>
+
 #ifdef MACOSX
 #include <malloc/malloc.h>
 #else
 #include <malloc.h>
 #endif
 
-namespace allscale { namespace components {
+namespace allscale
+{
+namespace components
+{
+
+// VV: threads, freq_idx
+#define NMD_NUM_KNOBS 2
+// VV: time, energy/power, resources
+#define NMD_NUM_OBJECTIVES 3
+
+
+#if (NMD_NUM_OBJECTIVES != 3)
+#error UNSUPPORTED number of Objectives
+#endif
+
+#if (NMD_NUM_KNOBS != 2)
+#error UNSUPPORTED number of Knobs
+#endif
+
+#define MAX_IT 1000 /* maximum number of iterations */
+#define ALPHA 1.0   /* reflection coefficient */
+#define BETA 0.5	/* contraction coefficient */
+#define GAMMA 2.0   /* expansion coefficient */
+#define DELTA 0.5   /* shrinking coefficient */
 
-#define MAX_IT      1000      /* maximum number of iterations */
-#define ALPHA       1.0       /* reflection coefficient */
-#define BETA        0.5       /* contraction coefficient */
-#define GAMMA       2.0       /* expansion coefficient */
+#define CACHE_EXPIRE_AFTER_MS 35000
 
 /* structure type of a single optimization step return status */
-struct optstepresult{
-      /* true if optimization has converged for the specified objective */
-      bool converged;
-      /* number of threads for parameters to set for sampling */
-      double threads;
-      /* index to frequency vector for freq parameter to set for sampling*/
-      int freq_idx;
+struct optstepresult
+{
+	/* true if optimization has converged for the specified objective */
+	bool converged;
+	/* number of threads for parameters to set for sampling */
+	double threads;
+	/* index to frequency vector for freq parameter to set for sampling*/
+	int freq_idx;
+	
+	/******VV: Cache stuff******/
+	double objectives[3]; // (time, energy, resource)
+	// VV: _cache_expires_dt denotes dt (in ms) after _cache_timestamp
+	int64_t _cache_timestamp, _cache_expires_dt;
 };
 
+typedef std::map<std::pair<int, int>, optstepresult> MapCache_t;
+
 /* enumeration encoding state that the incremental Nelder Mead optimizer is at */
-enum iterationstates {start, reflection, expansion, contraction};
+enum iterationstates
+{
+	// VV: Need NMD_NUM_KNOBS + 1 values before we can start optimizing
+	warmup,
+	start,
+	reflection,
+	expansion,
+	contraction_in,
+	contraction_out,
+	shrink
+};
+
 
-class NelderMead {
+class NelderMead
+{
 
   public:
-    NelderMead(double);
-    void initialize_simplex(double params[][2], double*,double*,double*);
-    void print_initial_simplex();
-    void print_iteration();
-    optstepresult step(double param);
-    double* getMinVertices(){
-        return v[vs];
-    }
+	NelderMead(const NelderMead &other);
+	NelderMead(double);
+	// VV: For the time being 
+	//     weights = [ W_time, W_energy/power, W_resources ]
+	//     initial_simplex = double[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS]
+	//     constraint_min = [min_threads, min_freq_idx]
+	void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES],
+							const double initial_simplex[][NMD_NUM_KNOBS],
+							const double constraint_min[NMD_NUM_KNOBS],
+							const double constraint_max[NMD_NUM_KNOBS]);
+	/*
+	void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES],
+							const double constraint_min[NMD_NUM_KNOBS],
+							const double constraint_max[NMD_NUM_KNOBS]);
+	*/
+
+	void print_initial_simplex();
+	void print_iteration();
+
+	void set_scale(const double scale[NMD_NUM_OBJECTIVES]);
+
+	double *getMinVertices()
+	{
+		return v[vs];
+	}
+
+	double getMinObjective()
+	{
+		return min;
+	}
+
+	// VV: Copies the simplex vertices into the caller-provided [NMD_NUM_KNOBS+1][NMD_NUM_KNOBS] array
+	void get_simplex(double simplex[][NMD_NUM_KNOBS]) {
+		for (auto i=0; i<NMD_NUM_KNOBS+1; ++i)
+			for (auto j=0; j<NMD_NUM_KNOBS; ++j)
+				simplex[i][j] = v[i][j];
+	}
+
+	unsigned long int getIterations() { return itr; }
+	double evaluate_score(const double objectives[], const double *weights);
+	void set_weights(const double weights[]);
+
+	optstepresult step(const double objectives[], 
+			double knob1, double knob2);
+
+	void invalidate_cache();
+	void reevaluate_scores();
+
+	void update_constraints(const double constraint_min[NMD_NUM_KNOBS],
+							const double constraint_max[NMD_NUM_KNOBS]);
 
-    double getMinObjective(){
-        return min;
-    }
+  private:
+	int warming_up_step;
+	bool should_invalidate_cache, should_reevaluate_scores;
+	double max_power_, max_time_;
 
-    unsigned long int getIterations(){return itr;}
+	// VV: Utility to make sure that we generate new values and not something that already
+	//     exists in the set of NMD_NUM_KNOBS+1 configuration points
+	template <typename F>
+	void generate_new(F &gen);
+	enum direction {up, up_final, down, left, right, right_final};
+	std::pair<int, direction> explore_next_extra(double *extra, int level, 
+                        direction dir, int level_max, int level_nested_max);
 
-  private:
-    int vg_index();
-    int vs_index();
-    int vh_index();
-    void my_constraints(double*);
-    void centroid();
-    bool testConvergence();
-    void updateObjectives();
+	//VV: objective_type: { <threads, cpu-freq>: optstepresult }
+	MapCache_t cache_;
 
-    double round2(double num, int precision)
-    {
-      double rnum = 0.0;
-      int tnum;
+	void do_invalidate_cache();
+	void do_reevaluate_scores();
 
-      if (num == 0.0)
-        return num;
+	optstepresult do_step_start();
+	optstepresult do_step_reflect(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_expand(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_contract_in(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_contract_out(const double objectives[], 
+			double knob1, double knob2);
+	optstepresult do_step_shrink();
+	optstepresult do_step_warmup(const double objectives[], 
+			double knob1, double knob2);
 
-      rnum = num*pow(10,precision);
-      tnum = (int)(rnum < 0 ? rnum-0.5 : rnum + 0.5);
-      rnum = tnum/pow(10,precision);
+	void sort_vertices(void);
+	void my_constraints(double *);
+	void centroid();
+	bool testConvergence(std::size_t tested_combinations);
 
-      return rnum;
-    }
+	// VV: Will return false if entry not in cache
+	bool cache_update(int threads, int freq_idx,
+					  const double objectives[],
+					  bool add_if_new);
 
-    /* vertex with smallest value */
-    int vs;         
+	bool convergence_reevaluating;
+	int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
+	double scale[NMD_NUM_OBJECTIVES];
+	/* vertex with smallest value */
+	int vs;
 
-     /* vertex with next smallest value */
-    int vh;        
+	/* vertex with next smallest value */
+	int vh;
 
-    /* vertex with largest value */
-    int vg;         
-	
-    int i,j,row;
+	/* vertex with largest value */
+	int vg;
 
-    const int n=2;
+	int i, j, row;
 
-    /* track the number of function evaluations */
-    int k;
+	const int n = 2;
 
-    /* track the number of iterations */
-    int itr;	  
-	
-    /* holds vertices of simplex */
-    double **v;
+	/* track the number of function evaluations */
+	int k;
 
-     /* value of function at each vertex */
-    double *f;
+	/* track the number of iterations */
+	int itr;
 
-    /* value of function at reflection point */
-    double fr;      
+	/* holds vertices of simplex */
+	double v[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
 
-    /* value of function at expansion point */
-    double fe;      
+	/* value of function at each vertex */
+	double f[NMD_NUM_KNOBS+1];
 
-     /* value of function at contraction point */
-    double fc;      
+	/* value of function at reflection point */
+	double fr;
 
-    /* reflection - coordinates */
-    double *vr;    
+	/* value of function at expansion point */
+	double fe;
 
-    /* expansion - coordinates */
-    double *ve;     
+	/* value of function at contraction point */
+	double fc;
 
-    /* contraction - coordinates */
-    double *vc;     
+	/* reflection - coordinates */
+	double vr[NMD_NUM_KNOBS];
 
-    /* centroid - coordinates */
-    double *vm;     
+	/* expansion - coordinates */
+	double ve[NMD_NUM_KNOBS];
 
-    double min;
-	
-    double fsum,favg,s;
+	/* contraction - coordinates */
+	double vc[NMD_NUM_KNOBS];
+
+	/* centroid - coordinates */
+	double vm[NMD_NUM_KNOBS];
+
+	double min;
+
+	double fsum, favg, s;
+
+	double EPSILON;
+
+	iterationstates state_;
+
+	const int MAXITERATIONS = 15;
 
-    double EPSILON;
+	double constraint_min[2];
 
-    iterationstates state_;
+	double constraint_max[2];
 
-    const int MAXITERATIONS = 15;
-  
-    double constraint_min[2];
+	double opt_weights[NMD_NUM_OBJECTIVES];
 
-    double constraint_max[2];
+	double next_constraint_min[NMD_NUM_KNOBS],
+			next_constraint_max[NMD_NUM_KNOBS];
+	bool should_update_constraints = false;
 
+	int times_used_cached;
 };
 
-}
-}
+} // namespace components
+} // namespace allscale
 #endif
diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp
index 7eed6e5..9eb9fbf 100644
--- a/allscale/components/scheduler.hpp
+++ b/allscale/components/scheduler.hpp
@@ -5,8 +5,10 @@
 #include <allscale/work_item.hpp>
 #include <allscale/components/treeture_buffer.hpp>
 #include <allscale/components/localoptimizer.hpp>
+
 #if defined(ALLSCALE_HAVE_CPUFREQ)
 #include <allscale/util/hardware_reconf.hpp>
+#else
 #endif
 
 #include <hpx/include/components.hpp>
@@ -44,6 +46,8 @@ namespace allscale { namespace components {
             HPX_ASSERT(false);
         }
 
+        bool get_optimization_score();
+
         scheduler(std::uint64_t rank);
         void init();
 
@@ -64,6 +68,22 @@ namespace allscale { namespace components {
             return active_threads;
         }
 
+        std::size_t get_total_threads() const {
+                return os_thread_count;
+        }
+        
+        void set_local_optimizer_weights(double time_weight, 
+                                         double energy_weight,
+                                         double resource_weight);
+        void get_local_optimizer_weights(double *time_weight, 
+                                         double *energy_weight,
+                                         double *resource_weight);
+        
+        void update_max_threads(std::size_t max_threads);
+
+        double get_last_objective_score() {
+                return last_objective_score;
+        }
     private:
 
         std::size_t get_num_numa_nodes();
@@ -82,18 +102,19 @@ namespace allscale { namespace components {
         bool do_split(work_item const& work, std::size_t numa_node);
 
         bool collect_counters();
-        //try to suspend resource_step threads, return number of threads which received a new suspend order;
-        // REM unsigned int suspend_threads();
-        unsigned int suspend_threads(std::size_t);
-        //try to resume resource_step threads, return number of threads which received a new resume order;
-        // REM         unsigned int resume_threads();
-        unsigned int resume_threads(std::size_t);
+        //try to suspend threads, return number of threads which received a new suspend order;
+                unsigned int suspend_threads(std::size_t);
+        
+        //try to resume threads, return number of threads which received a new resume order;
+                unsigned int resume_threads(std::size_t);
 
 #ifdef MEASURE_
         // convenience methods to update measured metrics of interest
-        void update_active_osthreads(std::size_t);
-        void update_power_consumption(std::size_t);
+        void update_active_osthreads(std::size_t threads, int64_t delta_time);
+        void update_power_consumption(std::size_t power_sample, int64_t delta_time);
 #endif
+        double last_objective_score;
+        int64_t last_measure_power, last_measure_threads;
 
         void fix_allcores_frequencies(int index);
 
@@ -108,7 +129,7 @@ namespace allscale { namespace components {
         long last_optimization_timestamp_;
 
         /* periodicity in milliseconds to invoke the optimizer */
-        const long optimization_period_ms = 5;
+        const long optimization_period_ms = 1000;
 
         /* captures absolute timestamp of the last time optimization
            objective value have been measured (sampled) */
@@ -117,7 +138,7 @@ namespace allscale { namespace components {
         long last_objective_measurement_timestamp_;
 
         /* periodicity in milliseconds to invoke objective sampling */
-        const long objective_measurement_period_ms = 1;
+        const long objective_measurement_period_ms = 500;
 
         //extra masks to better handle suspending/resuming threads
         std::vector<hpx::threads::thread_pool_base*> thread_pools_;
@@ -153,19 +174,12 @@ namespace allscale { namespace components {
         unsigned long long last_power_usage;
         unsigned long long power_sum;
         unsigned long long power_count;
+
 #if defined(ALLSCALE_HAVE_CPUFREQ)
         cpufreq_policy policy;
         hardware_reconf::hw_topology topo;
-        std::vector<unsigned long> cpu_freqs;
-        // Indices correspond to the freq id in cpu_freqs, and
-        // each pair holds energy usage and execution time
-        std::vector<std::pair<unsigned long long, double>> freq_times;
-        std::vector<std::vector<std::pair<unsigned long long, double>>> objectives_status;
-        unsigned int freq_step;
-        bool target_freq_found;
 #endif
-        unsigned int resource_step;
-        bool target_resource_found;
+        std::vector<unsigned long> cpu_freqs;
 
         mutable mutex_type throttle_mtx_;
         mutable mutex_type resize_mtx_;
@@ -186,9 +200,9 @@ namespace allscale { namespace components {
         bool resource_requested;
         bool energy_requested;
 
-        double time_leeway;
-        double resource_leeway;
-        double energy_leeway;
+        double time_weight;
+        double resource_weight;
+        double energy_weight;
         unsigned int period_for_time;
         unsigned int period_for_resource;
         unsigned int period_for_power;
diff --git a/allscale/dashboard.hpp b/allscale/dashboard.hpp
index 73670a2..eb77398 100644
--- a/allscale/dashboard.hpp
+++ b/allscale/dashboard.hpp
@@ -89,8 +89,10 @@ namespace allscale { namespace dashboard
 
         // current power usage / max power usage \in [0..1]
         float power = 0;
-
+        
         std::string to_json() const;
+        
+        float last_local_score = 0;  // NOTE(review): presumably the latest local optimizer score -- confirm; zero until assigned
 
         template <typename Archive>
         void serialize(Archive& ar, unsigned);
diff --git a/allscale/optimizer.hpp b/allscale/optimizer.hpp
index fc64428..a255497 100644
--- a/allscale/optimizer.hpp
+++ b/allscale/optimizer.hpp
@@ -11,6 +11,8 @@
 #include <hpx/lcos/future.hpp>
 #include <hpx/traits/is_bitwise_serializable.hpp>
 
+#include <allscale/components/nmsimplex_bbincr.hpp>
+
 #include <iosfwd>
 #include <vector>
 
@@ -23,6 +25,7 @@ namespace allscale {
         float avg_time_;
         unsigned long long energy_;
         std::uint64_t active_frequency_;
+        std::size_t active_cores_per_node_;
         std::size_t cores_per_node_;
 
         template <typename Archive>
@@ -33,6 +36,7 @@ namespace allscale {
             ar & avg_time_;
             ar & energy_;
             ar & active_frequency_;
+            ar & active_cores_per_node_;
             ar & cores_per_node_;
         }
     };
@@ -87,16 +91,35 @@ namespace allscale {
           , f_resource_max(other.f_resource_max)
           , f_resource_leeway(other.f_resource_leeway)
           , o_ino(std::move(o_ino))
-        {}
+          // VV: Used by balance_ino_nmd
+          , nmd_initialized(other.nmd_initialized)
+          , nmd(other.nmd)
+          , nodes_min(other.nodes_min)
+          , nodes_max(other.nodes_max)
+          , threads_min(other.threads_min)
+          , threads_max(other.threads_max)
+          , previous_num_nodes(other.previous_num_nodes)
+          , use_lopt(other.use_lopt)
+          , last_optimization_score(other.last_optimization_score)
+        {
+            objectives_scale[0] = other.objectives_scale[0];
+            objectives_scale[1] = other.objectives_scale[1];
+            objectives_scale[2] = other.objectives_scale[2];
+        }
 
         bool active() const
         {
             return active_;
         }
 
+        double get_optimization_score();
+
         hpx::future<void> balance(bool);
         hpx::future<void> balance_ino(const std::vector<std::size_t> &old_mapping);
+        hpx::future<void> balance_ino_nmd(const std::vector<std::size_t> &old_mapping);
         hpx::future<void> decide_random_mapping(const std::vector<std::size_t> &old_mapping);
+        
+        void signal_objective_changed();
 
         bool may_rebalance();
 
@@ -104,7 +127,7 @@ namespace allscale {
         std::size_t u_steps_till_rebalance;
 
         void tune(std::vector<optimizer_state> const& state);
-
+        int nmd_initialized;
         std::vector<bool> active_nodes_;
         std::uint64_t active_frequency_;
 
@@ -118,9 +141,17 @@ namespace allscale {
 
         std::vector<hpx::id_type> localities_;
 
+        // VV: balance_ino and balance_global data
         float f_resource_max, f_resource_leeway;
+        std::size_t previous_num_nodes;
+        int nodes_min, nodes_max, threads_min, threads_max;
 
         components::internode_optimizer_t o_ino;
+
+        components::NelderMead nmd;
+        double last_optimization_score;
+        double objectives_scale[3];
+        bool use_lopt;
     };
 }
 
diff --git a/allscale/scheduler.hpp b/allscale/scheduler.hpp
index 8cf6006..f448ad5 100644
--- a/allscale/scheduler.hpp
+++ b/allscale/scheduler.hpp
@@ -48,6 +48,7 @@ namespace allscale
 
         static HPX_EXPORT void update_policy(task_times const& times, std::vector<bool> mask, std::uint64_t frequency);
         static void apply_new_mapping(const std::vector<std::size_t> &new_mapping);
+        static void update_max_threads(std::size_t max_threads);
 
         static HPX_EXPORT void schedule(work_item&& work);
         static HPX_EXPORT components::scheduler* run(std::size_t rank);
diff --git a/allscale/tuner.hpp b/allscale/tuner.hpp
index da28253..f1285a8 100644
--- a/allscale/tuner.hpp
+++ b/allscale/tuner.hpp
@@ -3,6 +3,7 @@
 #define ALLSCALE_TUNER_HPP
 
 #include <allscale/tuning_objective.hpp>
+#include <allscale/components/nmd.hpp>
 
 #include <iostream>
 #include <vector>
@@ -74,6 +75,24 @@ namespace allscale {
 
         void next_direction();
     };
+
+    struct nmd_optimizer : tuner  // tuner implementation that owns a components::NmdGeneric instance
+    {
+        nmd_optimizer(std::size_t nodes_min, std::size_t nodes_max);  // NOTE(review): presumably the allowed range of the node-count knob -- confirm
+        components::NmdGeneric nmd;
+        std::vector<std::size_t> avail_freqs;  // NOTE(review): presumably the selectable CPU frequencies -- confirm
+        std::vector<std::size_t> best;  // NOTE(review): presumably the best knob values found so far -- confirm
+        bool converged;  // no in-class initializer: the constructor has to set it
+        bool initialized;  // no in-class initializer: the constructor has to set it
+        // VV: NmdGeneric supports an arbitrary number of optimization parameters, but
+        //     here it is applied to two knobs only: number of nodes and CPU frequency.
+        //     Adding the number of threads as a third knob would be trivial.
+        std::size_t constraint_min[2], constraint_max[2];
+
+        tuner_configuration next(tuner_configuration const& current_cfg, tuner_state const& current_state, tuning_objective) override;
+
+        double previous_weights[3];  // NOTE(review): presumably the objective weights seen by the previous next() call -- confirm
+    };
 }
 
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 25cf7c9..1481fbf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -25,6 +25,7 @@ set(_srcs
     components/util/graph_colouring.cpp
     components/localoptimizer.cpp
     components/nmsimplex_bbincr.cpp
+    components/nmd.cpp
 )
 
 if(CPUFREQ_FOUND)
diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp
index 86faa91..da51330 100644
--- a/src/components/localoptimizer.cpp
+++ b/src/components/localoptimizer.cpp
@@ -15,432 +15,397 @@
 //#define DEBUG_MULTIOBJECTIVE_ 1
 //#define DEBUG_CONVERGENCE_ 1
 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test
-#define MEASURE_ 1
+//#define MEASURE_ 1
 // only meant to be defined if one needs to measure the efficacy
 // of the scheduler
 //#define ALLSCALE_HAVE_CPUFREQ 1
-#define ALLSCALE_USE_CORE_OFFLINING 1
-
-namespace allscale {
-namespace components {
-
-localoptimizer::localoptimizer(std::list<objective> targetobjectives)
-  : objectives_((int)targetobjectives.size()),
-    nmd(0.01),
-    param_changes_(0),
-    steps_(0),
-    current_param_(thread),
-    converged_(false)
-  {
-    for (objective o : targetobjectives) {
-      //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-      objectives_[o.priority] = o;
-      objectives_[o.priority].localmin=10000;
-      objectives_[o.priority].globalmin=10000;
-      objectives_[o.priority].localmax=0.0;
-      objectives_[o.priority].globalmax=0.0;
-      objectives_[o.priority].converged=false;
-      objectives_[o.priority].initialized=false;
-      objectives_[o.priority].min_params_idx=0;
-      objectives_[o.priority].converged_minimum=0;
-    }
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    setCurrentFrequencyIdx(0);
-#endif
-};
 
-void localoptimizer::setobjectives(std::list<objective> targetobjectives){
-  objectives_.clear();
-  objectives_.resize((int)targetobjectives.size());
-  for (objective o : targetobjectives) {
-    //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n';
-    objectives_[o.priority] = o;
-    objectives_[o.priority].localmin=10000;
-    objectives_[o.priority].globalmin=10000;
-    objectives_[o.priority].localmax=0.0;
-    objectives_[o.priority].globalmax=0.0;
-    objectives_[o.priority].converged=false;
-    objectives_[o.priority].initialized=false;
-    objectives_[o.priority].min_params_idx=0;
-    objectives_[o.priority].converged_minimum=0;
-  }
-  steps_=0;
-  param_changes_=0;
-  current_param_=thread;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  setCurrentFrequencyIdx(0);
-#endif
-  converged_=false;
-}
+namespace allscale
+{
+namespace components
+{
 
-void localoptimizer::reset(int threads, int freq_idx){
-  threads_param_ = threads;
-  param_changes_=0;
-  thread_param_values_.clear();
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  frequency_param_= freq_idx;
-  frequency_param_values_.clear();
-#endif
-  current_objective_idx_=0;
-  steps_=0;
-  current_param_=thread;
-  converged_=false;
-};
+// Default state: empty accumulators, zero objective weights, mo_initialized and converged_ false.
+localoptimizer::localoptimizer()
+		: pending_threads(0.),
+		  pending_energy(0.),
+		  pending_time(0.),
+		  pending_num_times(0.),
+		  mo_initialized(false),
+		  frequency_param_(0),
+		  converged_(false),
+		  convergence_threshold_(0.005),
+		  time_weight(0.0),
+		  energy_weight(0.0),
+		  resource_weight(0.0),
+		  nmd(0.005)
+	{
+		if (optmethod_ == random)  // NOTE(review): optmethod_ is not set by the initializer list above -- confirm it is initialized at its declaration
+			srand(std::time(nullptr));
+
+		// VV: Start with 500ms as the guesstimate of the max iteration time
+		objectives_scale[0] = 0.5;
+		objectives_scale[1] = 1.0;
+		objectives_scale[2] = 1.0;
+
+		nmd.set_scale(objectives_scale);
+	}
+
+double localoptimizer::evaluate_score(const double objectives[])
+{
+	// Scores are delegated to nmd; before mo_initialized is set the result is -1.0.
+	if ( !mo_initialized )
+		return -1.0;
 
-#ifdef DEBUG_
-void localoptimizer::printobjectives(){
-  for(auto& el: objectives_){
-    std::cout << "Objective" << "\t\t" << "Priority" << "\t\t" << "Leeway" <<
-    std::endl;
-    switch (el.type){
-      case time:
-        std::cout << "Time" << "\t\t" << el.priority << "\t\t" << el.leeway <<
-        std::endl;
-        break;
-      case energy:
-        std::cout << "Energy" << "\t\t" << el.priority << "\t\t" << el.leeway <<
-        std::endl;
-        break;
-      case resource:
-        std::cout << "Resource" << "\t\t" << el.priority << "\t\t" << el.leeway <<
-        std::endl;
-        break;
-    }
-  }
+	return nmd.evaluate_score(objectives, nullptr);
 }
-
-void localoptimizer::printverbosesteps(actuation act){
-  std::cout << "[INFO]";
-  if (optmethod_==random)
-    std::cout << "Random ";
-  else if (optmethod_==allscale){
-    std::cout << "Allscale ";
-  }
-  std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  std::cout << ", CPU Frequency to " << frequencies_param_allowed_[act.frequency_idx]
-    << std::endl;
-#else
-  std::cout << std::endl;
-#endif
-
+// Install new objective weights; the optimizer then restarts from scratch.
+void localoptimizer::setobjectives(double time_weight,
+								   double energy_weight,
+								   double resource_weight)
+{
+	// Dropping both flags makes the next step re-initialize the optimizer.
+	converged_ = false;
+	mo_initialized = false;
+
+	// The parameters shadow the members of the same name, hence this->.
+	this->resource_weight = resource_weight;
+	this->energy_weight = energy_weight;
+	this->time_weight = time_weight;
 }
 
-#endif
+void localoptimizer::reset(int threads, int freq_idx)
+{
+	threads_param_ = threads;       // restart from the caller-supplied thread count
+	thread_param_values_.clear();   // discard the recorded thread parameter values
 
-void localoptimizer::measureObjective(double iter_time, double power, double threads){
-  for(auto& el: objectives_){
-    switch (el.type){
-      case time:
-        el.samples.insert(el.samples.begin(),iter_time);
-        if (el.samples.size()>1000)
-          el.samples.resize(500);
-
-        el.threads_samples.insert(el.threads_samples.begin(),threads);
-        if (el.threads_samples.size()>1000)
-          el.threads_samples.resize(500);
-
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx());
-        if (el.freq_samples.size()>1000)
-          el.freq_samples.resize(500);
-#endif
+	frequency_param_ = freq_idx;    // restart from the caller-supplied frequency index
+	converged_ = false;             // the search has to converge again
+}
 
-        if (el.globalmin > iter_time){
-          el.globalmin = iter_time;
-          el.min_params_idx=param_changes_;
-        }
-        if (el.globalmax < iter_time)
-          el.globalmax = iter_time;
-#ifdef DEBUG__
-        std::cout << "Iteration Time Minimum: " << el.globalmin << std::endl;
-        std::cout << "Iteration Time Maximum: " << el.globalmax << std::endl;
-        std::cout << "Iteration Time Samples: ";
-        for(auto& samp: el.samples)
-          std::cout << samp << ",";
-        std::cout << std::endl;
-#endif
-        break;
-      case energy:
-        el.samples.insert(el.samples.begin(),power);
-        if (el.samples.size()>1000)
-          el.samples.resize(500);
-
-        el.threads_samples.insert(el.threads_samples.begin(),threads);
-        if (el.threads_samples.size()>1000)
-          el.threads_samples.resize(500);
-
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx());
-        if (el.freq_samples.size()>1000)
-          el.freq_samples.resize(500);
+#ifdef DEBUG_
+// Debug helper: prints the three objective weights on a single line.
+void localoptimizer::printobjectives()
+{
+	std::cout << "[LocalOptimizer|DEBUG] Weights=[time:" << time_weight << ", energy:" << energy_weight
+			  << ", resource:" << resource_weight << "]" << std::endl << std::flush;
+}
 #endif
 
-        if (el.globalmin > power){
-          el.globalmin = power;
-          el.min_params_idx=param_changes_;
-        }
-        if (el.globalmax < power)
-          el.globalmax = power;
-#ifdef DEBUG__
-        std::cout << "Power Consumption Minimum: " << el.globalmin << std::endl;
-        std::cout << "Power Consumption Maximum: " << el.globalmax << std::endl;
-        std::cout << "Power Consumption Samples: ";
-        for(auto& samp: el.samples)
-          std::cout << samp << ",";
-        std::cout << std::endl;
-#endif
-        break;
-      case resource:
-        el.samples.insert(el.samples.begin(),threads);
-        if (el.samples.size()>1000)
-          el.samples.resize(500);
-
-        el.threads_samples.insert(el.threads_samples.begin(),threads);
-        if (el.threads_samples.size()>1000)
-          el.threads_samples.resize(500);
-
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx());
-        if (el.freq_samples.size()>1000)
-          el.freq_samples.resize(500);
+bool localoptimizer::isConverged()
+{	
+	#if 0
+	// VV: Disabled experiment: periodically re-explore the knob space so that
+	//     choices can adapt to tasks of smaller granularity (after splitting a task)
+	if ( converged_ == false ) {
+		return false;
+	}
+
+	auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+	if ( reexplore_every_ms >0 && timestamp_now - last_convergence_ts > reexplore_every_ms )
+	{	
+		std::cout << "[LOCALOPTIMIZER] Re-exploring space!" << std::endl;
+		initialize_nmd();
+	}
+	#endif 
+	return converged_; // with the block above compiled out, this only reports the flag
+}
+#ifdef DEBUG_
+// Debug trace of one scheduler step: policy name, thread count and CPU frequency.
+void localoptimizer::printverbosesteps(actuation act)
+{
+	// Frequency index shown by the previous call; kept when act carries a negative index.
+	static int shown_freq_idx = 0;
+	if (act.frequency_idx >= 0)
+		shown_freq_idx = act.frequency_idx;
+
+	// Policies other than random/allscale print no name at all.
+	const char *policy_name = "";
+	if (optmethod_ == allscale) policy_name = "Allscale ";
+	if (optmethod_ == random) policy_name = "Random ";
+
+	std::cout << "[INFO]" << policy_name
+			  << "Scheduler Step: Setting OS Threads to " << threads_param_
+			  << " , CPU Frequency to " << frequencies_param_allowed_[shown_freq_idx]
+			  << std::endl;
+}
 #endif
 
-        if (el.globalmin > threads){
-          el.globalmin = threads;
-          el.min_params_idx=param_changes_;
-        }
-        if (el.globalmax < threads)
-          el.globalmax = threads;
-#ifdef DEBUG__
-        std::cout << "Threads Minimum: " << el.globalmin << std::endl;
-        std::cout << "Threads Maximum: " << el.globalmax << std::endl;
-        std::cout << "Threads Samples: ";
-        for(auto& samp: el.samples)
-          std::cout << samp << ",";
-        std::cout << std::endl;
-#endif
-        break;
-    }
-  }
+// Turn the pending sums into per-sample averages; a no-op when nothing was sampled.
+void localoptimizer::accumulate_objective_measurements()
+{
+	if (!pending_num_times)
+		return;
+	pending_threads /= (double)(pending_num_times*threads_dt);
+	pending_energy /= (double)pending_num_times;
+	pending_time /= (double)pending_num_times;
+	pending_num_times = 0;
 }
 
-actuation localoptimizer::step()
+void localoptimizer::setmaxthreads(std::size_t threads)  // sets the thread ceiling and refreshes the optimizer's view of it
 {
-    steps_++;
-    actuation act;
-    act.delta_threads=threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    act.frequency_idx=frequency_param_;
-#endif
-    /* random optimization step */
-    if (optmethod_ == random)
-    {
-        act.delta_threads = (rand() % max_threads_) - threads_param_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        act.frequency_idx = rand() % frequencies_param_allowed_.size();
-        if (act.frequency_idx == frequency_param_)
-            act.frequency_idx = -1;
-#endif
-    }
-
-    else if (optmethod_ == allscale)
-    {
-        if (current_objective_idx_ > objectives_.size())
-  	    	return act;
+	max_threads_=threads;
+	threads_param_=threads;  // the current thread count is reset to the new ceiling
+	// threads_dt is the thread-knob step; two alternative policies are compiled out.
+	#if 0  // disabled: a fifth of the thread count, at least 1
+	double threads_tick = threads / 5.;
+
+	if ( threads_tick < 1.0 )
+		threads_tick = 1.0;
+	
+	threads_dt = (int) round(threads_tick);
+	#elif 0  // disabled: step of 1/2/4/8 chosen by thread-count bucket
+	if ( max_threads_ <= 4 )
+		threads_dt = 1.;
+	else if ( max_threads_ <= 8 )
+		threads_dt = 2.;
+	else if ( max_threads_ <= 32 )
+		threads_dt = 4.;
+	else
+		threads_dt = 8.;
+	#else  // active: single-thread steps
+		threads_dt = 1.;
+	#endif
+	// Nothing more to do until the optimizer has been initialized once.
+	if ( mo_initialized ) {
+		if ( converged_ == false ) {
+			initialize_nmd(true);  // search still running: restart it from scratch
+		} else {
+			double factor;
+			int min_freq = 0;
+			int max_freq = frequencies_param_allowed_.size() - 1;
+			// Converged: recompute the knob constraints with the same rules as initialize_nmd().
+			if ( time_weight >= energy_weight + resource_weight) {
+				factor = 0.5;
+				min_freq = frequencies_param_allowed_.size() / 4;
+			}		
+			else {
+				factor = 0.25;
+				max_freq = max_freq / 2;
+			}
+
+			int min_threads = factor * max_threads_/((double)threads_dt);
+
+			if ( min_threads < 1 )
+				min_threads = 1;
+			// Knob order in both arrays: {threads, frequency index}.
+			double constraint_min[] = {(double) min_threads, (double) min_freq};
+			#if defined(ALLSCALE_HAVE_CPUFREQ)
+			double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+									(double)max_freq};
+			#else  // no cpufreq support: the frequency index is capped at 0
+			std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl;
+			double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+									0.0};
+			#endif
+
+			nmd.update_constraints(constraint_min, constraint_max);
+		}
+	}
+}
 
-        if (steps_ < warmup_steps_)
-        {
+// (Re)build the Nelder-Mead simplex from the current objective weights.
+// When time_weight >= energy_weight + resource_weight the search keeps at least
+// half of the threads and skips the lowest quarter of the frequency indices;
+// otherwise it keeps at least a quarter of the threads and caps the frequency
+// index at half of the highest one (builds without cpufreq cap it at 0).
+// from_scratch: true seeds a fresh simplex inside those constraints, false
+//               re-seeds from the vertices currently held by nmd.
+void localoptimizer::initialize_nmd(bool from_scratch)
+{
+	// VV: Place constraints to #threads and cpu_freq tunable knobs
+	double factor;
+	int min_freq = 0;
+	int max_freq = frequencies_param_allowed_.size() - 1;
+
+	if ( time_weight >= energy_weight + resource_weight) {
+		factor = 0.5;
+		min_freq = frequencies_param_allowed_.size() / 4;
+	}
+	else {
+		factor = 0.25;
+		max_freq = max_freq / 2;
+	}
+
+	int min_threads = factor * max_threads_/((double)threads_dt);
+
+	if ( min_threads < 1 )
+		min_threads = 1;
+	int max_threads = max_threads_;
+
+	// Knob order in every array below: {threads, frequency index}.
+	double constraint_min[] = { (double) min_threads, (double) min_freq};
+	#if defined(ALLSCALE_HAVE_CPUFREQ)
+	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+							(double)max_freq};
+	#else 
+	std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl;
+	double constraint_max[] = {ceil(max_threads_/(double)threads_dt),
+							0.0};
+	#endif
+	const double opt_weights[] = { time_weight, energy_weight, resource_weight };
+
+	nmd.set_scale(objectives_scale);
+
+	if( from_scratch == false ){
+		// Re-seed from the vertices the optimizer currently holds.
+		double prev_simplex[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS];
+
+		nmd.get_simplex(prev_simplex);
+
+		nmd.initialize_simplex(opt_weights,
+								prev_simplex,
+								constraint_min, 
+								constraint_max);
+	} else {
+		// One starting simplex serves every weighting (the former per-weighting
+		// branches were identical): the lower corner of the constrained domain,
+		// a mid point, and a point at the highest allowed frequency index.
+		// NOTE(review): max_threads is not scaled by threads_dt, unlike min_threads.
+		double initial_simplex[3][2] = {
+			{(double) min_threads, constraint_min[1]},
+			{max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0},
+			{(min_threads+max_threads)/2., constraint_max[1]}
+		};
+		nmd.initialize_simplex(opt_weights,
+								initial_simplex,
+								constraint_min, 
+								constraint_max);
+	}
+
+	mo_initialized = true;
+	explore_knob_domain = true;
+	converged_ = false;
+}
 
-#ifdef DEBUG_MULTIOBJECTIVE_
-            std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n";
-#endif
-            // set some random parametrization to collect at least 3 different
-            // vertices to be used as input to the optimizer
-    	    act.delta_threads = rand() % max_threads_;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    	    act.frequency_idx = rand() % frequencies_param_allowed_.size();
-#endif
-            return act;
-        }
+// Store the per-objective scale factors and hand the same values to nmd.
+void localoptimizer::set_objectives_scale(const double objectives_scale[3])
+{
+	for (std::size_t idx = 0; idx < NMD_NUM_OBJECTIVES; ++idx)
+		this->objectives_scale[idx] = objectives_scale[idx];
+	nmd.set_scale(objectives_scale);
+}
 
-        // iterate over all objectives in decreasing priority
-        objective obj = objectives_[current_objective_idx_];
+void localoptimizer::measureObjective(double iter_time, double power, double threads)
+{
+	// Of the three inputs only iter_time is unbounded: threads is bounded by
+	// max_threads_ and power by 1.0.
+	// NOTE(review): this trace is printed for every sample, without a debug guard.
+	std::cout << "Measuring objective: " << iter_time << " " << power << " " << threads << std::endl;
+	// Grow the time scale, with 10% headroom, once a sample exceeds it.
+	const bool time_scale_exceeded = objectives_scale[0] < iter_time;
+	if ( time_scale_exceeded ) {
+		objectives_scale[0] = iter_time * 1.1;
+		set_objectives_scale(objectives_scale);
+	}
+
+	// Fold the sample into the running sums; threads is added as a fraction of max_threads_.
+	pending_num_times++;
+	pending_threads += threads / max_threads_;
+	pending_energy += power;
+	pending_time += iter_time;
+}
 
-        // initialize optimizer for this objective, if not already done so
-        if (!obj.initialized)
-        {
-#ifdef DEBUG_MULTIOBJECTIVE_
-            std::cout << "[LOCALOPTIMIZER|INFO] Initializing optimizer for new objective\n";
-	        std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
-	        for (auto& sam: obj.samples)
-            {
-	            std::cout << sam << "," << std::flush;
-	        }
-            std::cout << "\n" << std::flush;
-
-            std::cout << "[LOCALOPTIMIZER|DEBUG] Thread Param of Samples: " << std::flush;
-            for (auto& sam: obj.threads_samples)
-            {
-                std::cout << sam << "," << std::flush;
-            }
-            std::cout << "\n" << std::flush;
-
-#ifdef ALLSCALE_HAVE_CPUFREQ
-            std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
-            for (auto& sam: obj.freq_samples){
-                std::cout << sam << "," << std::flush;
-            }
-            std::cout << "\n" << std::flush;
-#endif
-#endif
-            int samplenr = obj.samples.size();
-#ifdef ALLSCALE_HAVE_CPUFREQ
-            double params[3][2]={
-                {obj.threads_samples[samplenr-1],obj.freq_samples[samplenr-1]},
-                {obj.threads_samples[samplenr-2],obj.freq_samples[samplenr-2]},
-                {obj.threads_samples[samplenr-3],obj.freq_samples[samplenr-3]},
-            };
-            double values[3]={obj.samples[samplenr-1],obj.samples[samplenr-2],obj.samples[samplenr-3]};
-
-            double constraint_min[]={1,0};
-            double constraint_max[]={(double)max_threads_,
-                (double)frequencies_param_allowed_.size()};
-
-            nmd.initialize_simplex(params,values,constraint_min,constraint_max);
-            objectives_[current_objective_idx_].initialized=true;
-#endif
-        }
+// Clear the running sums and the sample counter so that the next
+// measurement window starts empty.
+void localoptimizer::reset_accumulated_measurements()
+{
+	pending_num_times = 0;
+	pending_threads = pending_energy = pending_time = 0.;
+}
 
-#ifdef DEBUG_MULTIOBJECTIVE_
-        std::cout << "[LOCALOPTIMIZER|DEBG] Current Optimized Objective =";
-        switch (obj.type)
-        {
-            case energy:
-                std::cout << "********** Energy\n" << std::flush;
-                break;
-            case time:
-                std::cout << "&&&&&&&&&& Time\n" << std::flush;
-                break;
-            case resource:
-                std::cout << "oooooooooo Resource\n" << std::flush;
-                break;
-        }
-        std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush;
-        for (auto& sam: obj.samples)
-        {
-            std::cout << sam << "," << std::flush;
-        }
-        std::cout << "\n" << std::flush;
-
-        std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        for (auto& sam: obj.freq_samples)
-        {
-            std::cout << sam << "," << std::flush;
-        }
-        std::cout << "\n" << std::flush;
-#endif
-#endif
+// One optimization step: returns the thread count and frequency index to apply next.
+actuation localoptimizer::step(std::size_t active_threads)
+{
+	actuation act;
+	// VV: Possibly amend erroneous information
+	threads_param_  = active_threads;
+	act.threads = threads_param_;
+	act.frequency_idx = frequency_param_;
+
+	/* random optimization step */
+	if (optmethod_ == random)
+	{
+		act.threads = (rand() % max_threads_);
+		if (!frequencies_param_allowed_.empty())  // empty list: keep the current index, the modulo would divide by zero
+			act.frequency_idx = rand() % frequencies_param_allowed_.size();
+	}
+	else if (optmethod_ == allscale)
+	{
+		// VV: Keep track of dirty objectives
+		if (mo_initialized == false)
+			initialize_nmd(true);
+		accumulate_objective_measurements();
+		const double latest_measurements[] = {pending_time, 
+											pending_energy, 
+											pending_threads};
+		reset_accumulated_measurements();
+
+		if ( converged_ == false ){
+			optstepresult nmd_res = nmd.step(latest_measurements,
+											 active_threads,
+											 frequency_param_);
 
-        optstepresult nmd_res = nmd.step(obj.samples[0]);
 #ifdef DEBUG_MULTIOBJECTIVE_
-        std::cout << "[LOCALOPTIMIZER|DEBUG] Calling NMD Optimizer Step, Param = \n";
-        std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try: ";
-        std::cout << "Threads = " << nmd_res.threads;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl;
-#endif
-        std::cout << "Converg Thresh = " << convergence_threshold_ << std::endl;
-#endif
-        if (nmd_res.converged)
-        {
-            objectives_[current_objective_idx_].converged = true;
-            objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
-            double* minimization_point = nmd.getMinVertices();
-            objectives_[current_objective_idx_].minimization_params[0]=
-                minimization_point[0];
-            objectives_[current_objective_idx_].minimization_params[1]=
-                minimization_point[1];
-#ifdef DEBUG_CONVERGENCE_
-            std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n";
-            std::cout << "******************************************" << std::endl;
-            std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " <<
-                objectives_[current_objective_idx_].converged_minimum <<
-                "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] <<
-                std::endl;
-            std::cout << "******************************************" << std::endl;
-#endif
-            act.delta_threads=minimization_point[0];
-#ifdef ALLSCALE_HAVE_CPUFREQ
-            act.frequency_idx=minimization_point[1];
+			std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try:";
+			std::cout << " Threads = " << nmd_res.threads;
+			std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl;
+			std::cout << " Converge Thresh = " << convergence_threshold_ << std::endl;
 #endif
-            current_objective_idx_++;
-            if (current_objective_idx_ == objectives_.size())
-            {
-                converged_=true;
-#ifdef DEBUG_CONVERGENCE_
-                std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
-#endif
-            }
-        }
-        else
-        {
-            // if a higher priority objective starts getting off leeway margin,
-            // decide convergence of the current param at this parameter point
-            if (current_objective_idx_>0)
-                for (int i=0;i<current_objective_idx_;i++)
-                {
-                    objective priority_obj=objectives_[i];
-                    double max_leeway_value = priority_obj.converged_minimum +
-                        priority_obj.leeway*(priority_obj.globalmax - priority_obj.converged_minimum);
-                    if (priority_obj.samples[0] > max_leeway_value &&
-                            priority_obj.samples[1] > max_leeway_value)
-                    {
-                        objectives_[current_objective_idx_].converged = true;
-                        objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective();
-                        double* minimization_point = nmd.getMinVertices();
-                        objectives_[current_objective_idx_].minimization_params[0]=
-                            minimization_point[0];
-                        objectives_[current_objective_idx_].minimization_params[1]=
-                            minimization_point[1];
+			if (nmd_res.converged)
+			{
+				double min_score = nmd.getMinObjective();
+				double *minimization_point = nmd.getMinVertices();
 
 #ifdef DEBUG_CONVERGENCE_
-                        std::cout << "[LOCALOPTIMIZER|INFO] Leeway convergence\n";
-                        std::cout << "******************************************" << std::endl;
-                        std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " <<
-                            objectives_[current_objective_idx_].converged_minimum <<
-                            "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] <<
-                            std::endl;
-                        std::cout << "******************************************" << std::endl;
-#endif
-                        // find the parameter point that scores the leeway margin value
-						act.delta_threads = (int)priority_obj.minimization_params[0]*
-                            (max_leeway_value/priority_obj.converged_minimum);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-                        act.frequency_idx = (int)priority_obj.minimization_params[1]*
-                            (max_leeway_value/priority_obj.converged_minimum);
-#endif
-                        //act.delta_threads=minimization_point[0];
-			            //act.frequency_idx=minimization_point[1];
-			            current_objective_idx_++;
-			            if (current_objective_idx_ == objectives_.size())
-                        {
-                            converged_=true;
-#ifdef DEBUG_CONVERGENCE_
-                            std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl;
+				std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n";
+				std::cout << "******************************************" << std::endl;
+				std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << min_score << " Threads = " << minimization_point[0] << " Freq_idx = " << minimization_point[1] << std::endl;
+				std::cout << "******************************************" << std::endl;
 #endif
-                        }
-                        return act;
-                    }
-    		}
-            act.delta_threads=(nmd_res.threads==0)?getCurrentThreads():nmd_res.threads;
-#ifdef ALLSCALE_HAVE_CPUFREQ
-            act.frequency_idx=nmd_res.freq_idx;
+				act.threads = minimization_point[0];
+				act.frequency_idx = minimization_point[1];
+				
+				// VV: Stop searching for new knob_set
+				converged_ = true;
+			} else {
+				// VV: Have not converged yet, keep exploring
+				act.threads = nmd_res.threads;
+				act.frequency_idx = nmd_res.freq_idx;
+			}
+			// Scale the optimizer's thread knob by threads_dt to obtain a thread count.
+			act.threads *= threads_dt;
+
+			// threads_param_ is stored only after validation below. NOTE(review): getCurrentThreads() presumably reads it.
+#ifdef DEBUG_MULTIOBJECTIVE_
+			std::cout << "[LOCALOPTIMIZER|DEBUG] ACTUAL Vertex to try:";
+			std::cout << " Threads = " << act.threads;
+			std::cout << " Freq Idx = " << act.frequency_idx << std::endl;
 #endif
-        }
-    }
-    return act;
-}
-}
+		}
+	}
+
+	// Clamp the actuation to the valid knob ranges before remembering it.
+	if (act.threads > max_threads_)
+	{
+		act.threads = max_threads_;
+	}
+	else if (act.threads < 1)
+	{
+		act.threads = getCurrentThreads();
+	}
+
+	// VV: If freq_idx is -1 then set it to last used frequency (frequency_param_)
+	if (act.frequency_idx < 0)
+		act.frequency_idx = frequency_param_;
+	else if (act.frequency_idx > frequencies_param_allowed_.size() - 1)
+		act.frequency_idx = frequencies_param_allowed_.size() - 1;
+
+	threads_param_ = act.threads;
+	frequency_param_ = act.frequency_idx;
+
+	return act;
 }
+} // namespace components
+} // namespace allscale
diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp
index 3b6e4b5..5ae6463 100644
--- a/src/components/monitor_component.cpp
+++ b/src/components/monitor_component.cpp
@@ -26,6 +26,11 @@
 
 #include <hpx/lcos/gather.hpp>
 
+#ifdef ALLSCALE_HAVE_CPUFREQ
+#define POWER_MEASUREMENT_PERIOD_MS 100
+#include <allscale/util/hardware_reconf.hpp>
+#endif
+
 #ifdef HAVE_PAPI
 #include <boost/tokenizer.hpp>
 #include <string.h>
@@ -353,16 +358,55 @@ namespace allscale { namespace components {
 
    float monitor::get_current_power()
    {
+      #ifdef ALLSCALE_HAVE_CPUFREQ
+      /*VV: Every call takes one reading from hardware_reconf::read_system_power()
+            and returns the mean of all readings taken in the current window.
+            The window restarts once it is POWER_MEASUREMENT_PERIOD_MS ms old.*/
+      static mutex_type power_mtx;
+      static unsigned long long times_read_power=0;
+      static unsigned long long power_sum = 0ull;
+      static long timestamp_reset_power = 0;
+      // function-local statics: the window state persists across calls
+      int64_t t_now, dt;
+      float ret;
+      // power_mtx serialises every update of the statics above
+      std::lock_guard<mutex_type> lock(power_mtx);
+      
+      t_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+ 
+      dt = t_now - timestamp_reset_power;
+      times_read_power ++;
+      // power_sum is an integer accumulator (NOTE(review): confirm the return type of read_system_power())
+      power_sum += util::hardware_reconf::read_system_power();
+
+      ret = power_sum / (float)(times_read_power);
+      // the reading that closes a window is still part of the mean returned by this call
+      if ( dt >= POWER_MEASUREMENT_PERIOD_MS ) {
+            times_read_power = 0;
+            power_sum = 0ull;
+            timestamp_reset_power = t_now;
+      }
+
+      return ret;
+      #else
       return allscale::power::estimate_power(get_current_freq(0)) * num_cpus_;
+      #endif
    }
 
 
    float monitor::get_max_power()
    {
-#ifdef POWER_ESTIMATE
+#if defined(ALLSCALE_HAVE_CPUFREQ)
+      // VV: Report a fixed 1100 Watts.
+      //  ( The vendor paper 5283 for the 8335-GTA, presumably IBM Redpaper REDP-5283,
+      //   lists 1875 W for the whole node, but only up to ~1100 W has been
+      //   observed; for the time being this is a good enough figure. )
+      //  ( TODO: this should be configured or discovered dynamically. )
+      return 1100.0;
+#elif defined(POWER_ESTIMATE)
       return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_;
 #else
-      return 0.0;
+      return 125.0; // fixed fallback when neither macro is defined (NOTE(review): origin of 125 not visible here)
 #endif
    }
 
diff --git a/src/components/nmd.cpp b/src/components/nmd.cpp
new file mode 100644
index 0000000..bb59b1a
--- /dev/null
+++ b/src/components/nmd.cpp
@@ -0,0 +1,873 @@
+#include <iostream>
+#include <chrono>
+#include <cstdlib>
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <allscale/components/nmd.hpp>
+
+
+//#define NMD_DEBUG_
+//#define NMD_INFO_
+
+#ifdef NMD_DEBUG_
+#define OUT_DEBUG(X) X
+#ifndef NMD_INFO_
+    #define NMD_INFO_
+#endif
+#else
+#define OUT_DEBUG(X) {}
+#endif
+
+#if defined(NMD_INFO_)
+#define OUT_INFO(X) X
+#else
+#define OUT_INFO(X) {}
+#endif
+
+
+using namespace allscale::components;
+
+NmdGeneric::NmdGeneric()   // empty optimizer: no knobs, no objectives, no buffers, state "warmup"
+: current_state(warmup), warmup_step(0), conv_threshold(0), num_knobs(0), num_objectives(0),
+scores(nullptr), simplex(nullptr), initial_config(nullptr),
+constraint_max(nullptr), constraint_min(nullptr),
+point_reflect(nullptr), point_contract(nullptr), weights(nullptr)
+{   // The remaining members used to be left indeterminate; they are assigned in the body so the initializer list keeps its order
+    centroid = nullptr; point_expand = nullptr; score_function = nullptr; cache_expire_dt_ms = 0;
+    max_iters = 0; final_explore = false; iteration = 0; times_reentered_start = 0; }
+
+NmdGeneric::NmdGeneric(std::size_t num_knobs, 
+                        std::size_t num_objectives, 
+                        double conv_threshold,
+                        int64_t cache_expire_dt_ms,
+                        std::size_t max_iters)
+: conv_threshold(conv_threshold), num_knobs(num_knobs), 
+num_objectives(num_objectives), 
+cache_expire_dt_ms(cache_expire_dt_ms),
+final_explore(false),
+max_iters(max_iters)
+{   // Allocates every per-knob / per-objective buffer; their contents are written later by initialize()
+    scores = new double [num_knobs+1];
+    centroid = new std::size_t [num_knobs];
+    simplex = new std::size_t* [num_knobs+1];
+    initial_config = new std::size_t* [num_knobs+1];
+    // simplex and initial_config each hold num_knobs+1 vertices of num_knobs knobs
+    for (auto i=0ul; i<num_knobs+1; ++i) {
+        simplex[i] = new std::size_t [num_knobs];
+        initial_config[i] = new std::size_t [num_knobs];
+    }
+
+    constraint_max = new std::size_t [num_knobs];
+    constraint_min = new std::size_t [num_knobs];
+    point_reflect = new std::size_t [num_knobs];
+    point_contract = new std::size_t [num_knobs];
+    point_expand = new std::size_t [num_knobs];
+    weights = new double [num_objectives];
+    // Give the state machine defined values in case get_next() runs before initialize()
+    current_state = warmup; warmup_step = 0; iteration = 0; times_reentered_start = 0; score_function = nullptr;
+}
+
+double NmdGeneric::score(const double measurements[]) const
+{   // Scalarises the measurements with the stored callback and the stored weights
+    return score_function(measurements, this->weights);
+}
+
+void NmdGeneric::initialize(const std::size_t constraint_min[], 
+                            const std::size_t constraint_max[],
+                            const std::size_t *initial_config[], 
+                            const double weights[], double (*score_function)(const double[], const double []))
+{   // Stores weights, scoring callback and bounds, picks the initial simplex and resets the state machine to warmup
+    for (auto i=0ul; i<num_objectives; ++i)
+        this->weights[i] = weights[i];
+    // the parameters shadow the members of the same name, hence this->
+    this->score_function = score_function;
+
+    set_constraints_now(constraint_min, constraint_max);
+
+    iteration = 0;
+    if ( initial_config == nullptr ) {
+        std::set<std::vector<std::size_t> > fake;
+        // "fake" collects the vertices chosen so far; generate_unique() is told to avoid them
+        OUT_INFO(
+            std::cout << "[NMD|Info] Generating initial config for " << num_knobs << std::endl;
+        )
+        // no simplex supplied: draw each knob with std::rand() inside [constraint_min, constraint_max]
+        for (auto i=0ul; i<num_knobs+1; ++i ) {
+            for ( auto j=0ul; j<num_knobs; ++j ) {
+                auto width = constraint_max[j] - constraint_min[j] + 1;
+                this->initial_config[i][j] = std::rand() % width + constraint_min[j];
+            }
+
+            generate_unique(this->initial_config[i], false, &fake);
+            auto new_key = std::vector<std::size_t>();
+            new_key.assign(this->initial_config[i], this->initial_config[i]+num_knobs);
+            fake.insert(new_key);
+        }
+    } else {
+        for (auto i=0ul; i<num_knobs+1; ++i )
+            for (auto j=0ul; j<num_knobs; ++j )
+                this->initial_config[i][j] = initial_config[i][j];
+    }
+    // the caller-supplied simplex above is copied verbatim (num_knobs+1 rows of num_knobs knobs)
+    current_state = warmup;
+    warmup_step = 0;
+
+    OUT_INFO(
+        for (auto i=0ul; i<num_knobs+1; ++i ) {
+            std::cout << "[NMD|Info] Initial config " << i << " : ";
+            for (auto j=0ul; j<num_knobs; ++j )
+                std::cout << this->initial_config[i][j] << " ";
+            std::cout << std::endl;
+        }
+    )   
+
+    final_explore = false;
+    times_reentered_start = 0;
+}
+
+void NmdGeneric::set_constraints_now(const std::size_t constraint_min[],
+                                    const std::size_t constraint_max[])
+{   // Replaces the stored per-knob bounds with the first num_knobs entries of the given arrays
+    for (std::size_t knob = 0; knob != num_knobs; ++knob) {
+        this->constraint_min[knob] = constraint_min[knob];
+        this->constraint_max[knob] = constraint_max[knob];
+    }
+}
+
+void NmdGeneric::generate_unique(std::size_t initial[], bool accept_stale=false, 
+                                const std::set<std::vector<std::size_t> > *extra=nullptr) const
+{   // Rewrites initial[] in place with the closest admissible point near it; initial[] is untouched when none exists
+    const auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    // explored = cache entries that count as visited (every entry when accept_stale is set, else only unexpired ones)
+    auto explored = (std::size_t) std::count_if(cache.begin(), cache.end(), [ts_now, accept_stale](const auto &entry) {
+        auto dt = ts_now - entry.second.cache_ts;
+        return accept_stale || dt < entry.second.cache_dt;
+    });
+
+    auto max_comb = compute_max_combinations();
+    // search only while more than one combination is still unexplored
+    if ( max_comb > explored && max_comb - explored > 1 ) {
+        // VV: TODO Optimize check_novel(). Currently, large "max_distance" values 
+        //     may result in extreme overheads
+        const auto max_distance = 3ul;
+        int64_t temp[num_knobs];
+        std::set< std::vector<std::size_t> > candidates;
+        // check_novel clamps the knobs and records them as a candidate unless they are listed in *extra
+        auto check_novel = [this, &ts_now, &candidates, &accept_stale, &extra](int64_t knobs[]) mutable -> void {
+            apply_constraint(knobs);
+
+            auto key = std::vector<std::size_t>();
+
+            key.assign(knobs, knobs+num_knobs);
+            auto entry = cache.find(key);
+            if ( extra == nullptr || extra->find(key) == extra->end()) {
+                if ( entry == cache.end() ) {
+                    candidates.insert(key);
+                } else {
+                    OUT_DEBUG(
+                        std::cout << "Found ";
+                        for (auto i=0ul; i<num_knobs; ++i ) std::cout << key[i] << " ";
+                        std::cout << std::endl;
+                    )
+                    auto dt = ts_now - entry->second.cache_ts;
+                    if (accept_stale==false || 
+                        (dt >= entry->second.cache_dt && cache_expire_dt_ms > 0) ) {
+                        candidates.insert(key);
+                    }
+                }
+            }
+        };
+
+        auto counters = std::vector<std::size_t>(num_knobs, 0ul);
+
+        bool done = false;
+
+        while ( done == false ) {
+            // VV: Generate all possible permutations
+            auto ops = std::string(num_knobs, '0');
+            do{
+                for ( auto j=0ul; j<num_knobs; ++j ) {
+                    temp[j] = ops[j] == '0' ? initial[j] + counters[j] :
+                                        initial[j] - counters[j];
+                }
+                check_novel(temp);
+            } while (next_binary(ops.begin(), ops.end()));
+
+            // VV: Increase inner-most loop and see if the whole process is terminated or not
+            counters[0] += 1;
+
+            for ( auto i=0ul; i<num_knobs-1; ++i ) {
+                if ( (counters[i] > constraint_max[i] - constraint_min[i] +1) ||
+                    (counters[i] > max_distance) ) {
+                    counters[i] = 0;
+                    counters[i+1] += 1;
+                }
+            }
+
+            if ( (counters[num_knobs-1] > 
+                    constraint_max[num_knobs-1] - constraint_min[num_knobs-1] +1)
+                || (counters[num_knobs-1] > max_distance))
+                done = true;
+        }
+
+        // order the candidates by squared distance from initial[], closest first
+
+        std::vector< std::vector<std::size_t> > sorted;
+
+        sorted.assign(candidates.begin(), candidates.end());
+        candidates.clear();
+
+        std::sort(sorted.begin(), sorted.end(), 
+            [initial](const auto &e1, const auto &e2) mutable -> int {
+                int64_t t;
+                std::size_t d1=0ul, d2=0ul;
+
+                for (auto i=0ul; i<e1.size(); ++i) {
+                    t = (int64_t)e1[i] - (int64_t)initial[i];
+                    d1 += t*t;
+
+                    t = (int64_t)e2[i] - (int64_t)initial[i];
+                    d2 += t*t;
+                }
+
+                return d1 < d2;
+            });
+        if ( sorted.empty() == false ) // every neighbour may have been excluded through *extra
+            std::copy(sorted[0].begin(), sorted[0].end(), initial);
+    }
+}
+
+std::size_t NmdGeneric::compute_max_combinations() const
+{   // Size of the search space: product of the per-knob value counts, or 0 while the bounds are unset
+    if ( constraint_max == nullptr || constraint_min == nullptr ) {
+        return 0ul;
+    }
+    std::size_t combinations = 1;
+    // each knob offers (max - min + 1) values and the knobs vary independently, so the counts multiply
+    for ( auto i=0ul; i<num_knobs; ++i )
+        combinations *= constraint_max[i] - constraint_min[i] +1;
+    
+    return combinations;
+}
+
+void NmdGeneric::ensure_profile_consistency(std::size_t expected[], 
+    const std::size_t observed[]) const
+{
+    // Overwrites expected[] with observed[] when the two differ in any of the num_knobs
+    // positions, so later steps work with the knobs that were actually reported.
+    const bool same = std::equal(expected, expected + num_knobs, observed);
+
+    if ( same == false ) {
+        // the whole report lives inside OUT_INFO: without NMD_INFO_ nothing is printed
+        OUT_INFO(
+            std::cout << "[NMD|Info] Profile does not match last suggestion, will correct: ";
+
+            for (auto i=0ul; i<num_knobs; ++i)
+                std::cout << expected[i] << " ";
+
+            std::cout << " -- ";
+
+            for (auto i=0ul; i<num_knobs; ++i)
+                std::cout << observed[i] << " ";
+            std::cout << std::endl;
+        )
+
+        for (auto i=0ul; i<num_knobs; ++i) 
+            expected[i] = observed[i];
+    }
+}
+
+void NmdGeneric::compute_centroid()
+{   // centroid[i] = rounded, clamped mean of knob i over vertices 0..num_knobs-1 (the last vertex is left out)
+    double c[num_knobs];
+    // NOTE(review): do_start() sorts the simplex first, so the excluded vertex is presumably the worst-scoring one
+    for (auto i=0ul; i<num_knobs; ++i )
+    {
+        c[i] = 0.0;
+        // simplex[j] is a vertex and simplex[j][i] its i-th knob: sum over vertices, not over one vertex's knobs
+        for (auto j=0ul; j<num_knobs; ++j)
+            c[i] += simplex[j][i];
+        
+        c[i] = round(c[i]/(double) num_knobs);
+    }
+    apply_constraint(c);
+
+    for (auto i=0ul; i<num_knobs; ++i)
+        centroid[i] = (std::size_t) c[i];
+}
+
+void NmdGeneric::sort_simplex(bool consult_cache)
+{   // Refreshes the cache stamps of the simplex vertices, then orders simplex[]/scores[] by ascending score
+    auto key = std::vector<std::size_t>();
+    const auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    for ( auto i = 0ul; i<num_knobs+1; ++i) {
+        key.assign(simplex[i], simplex[i]+num_knobs);
+        auto entry = cache.find(key);
+        // a vertex without a cache entry has nothing to refresh; never dereference end()
+        if ( entry == cache.end() ) continue;
+        entry->second.cache_ts = ts_now;
+        entry->second.cache_dt = cache_expire_dt_ms;
+    }
+
+    OUT_DEBUG(
+        std::cout << "CACHE ENTRIES: "<<cache.size() << std::endl;
+        for (const auto &e:cache) {
+            for (auto i=0ul; i<num_knobs; ++i)
+                std::cout << e.second.knobs[i] << " ";
+            std::cout << ": ";
+            for (auto i=0ul; i<num_objectives; ++i)
+                std::cout << e.second.objectives[i] << " ";
+            std::cout << " = " << score(e.second.objectives.data());
+            std::cout << std::endl;
+        }
+    )
+    // with consult_cache, every unexpired cache entry competes for a place in the simplex
+    std::vector<logistics> fresh;
+    if ( consult_cache ) {
+        for (const auto &e:cache )
+            if (ts_now - e.second.cache_ts < e.second.cache_dt)
+                fresh.push_back(e.second);
+    
+        std::sort(fresh.begin(), fresh.end(), 
+            [this](const auto &e1, const auto &e2) mutable ->int {
+                return this->score(e1.objectives.data()) < this->score(e2.objectives.data());
+            });
+    }
+    // enough unexpired entries: rebuild the simplex from the num_knobs+1 best of them
+    if ( fresh.size() >= num_knobs+1) {
+        for (auto i=0ul; i<num_knobs+1; ++i) {
+            memcpy(simplex[i], fresh[i].knobs.data(), sizeof(std::size_t)*num_knobs);
+            scores[i] = score(fresh[i].objectives.data());
+        }
+    } else {
+        std::vector< std::pair<double, std::vector<std::size_t> > > plain;
+        for ( auto i=0ul; i<num_knobs+1; ++i ) {
+            key.assign(simplex[i], simplex[i] + num_knobs);
+            plain.push_back( std::make_pair(scores[i], key));
+        }
+        // otherwise only reorder the current vertices by their stored scores
+        std::sort(plain.begin(), plain.end(), 
+        [](const auto &e1, const auto &e2) mutable ->int {
+            return e1.first < e2.first;
+        });
+
+        for (auto i=0ul; i<num_knobs+1; ++i) {
+            memcpy(simplex[i], plain[i].second.data(), sizeof(std::size_t)*num_knobs);
+            scores[i] = plain[i].first;
+        }
+    }
+}
+
+std::vector<std::size_t> NmdGeneric::do_start(bool consult_cache=true)
+{   // Begins one iteration: sorts the simplex, computes the centroid and proposes the reflection point
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER start" << std::endl;
+    )
+    iteration ++;
+    sort_simplex(false); // NOTE(review): consult_cache is not forwarded; confirm this is intended
+    compute_centroid();
+    double temp[num_knobs];
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] Initial simplex" << std::endl;
+        for ( auto i=0ul; i<num_knobs+1; ++i) {
+            std::cout << "[NMD|Info] Score " << scores[i];
+            for ( auto j=0ul; j<num_knobs; ++j)
+                std::cout << " " << simplex[i][j];
+            std::cout << std::endl;
+        }
+
+        std::cout << "[NMD|Info] Centroid: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << centroid[i] << " ";
+        std::cout << std::endl;
+    )
+    for (auto i=0ul; i<num_knobs; ++i)
+        temp[i] = centroid[i] + ALPHA * (centroid[i] - (double)simplex[num_knobs][i]);
+    // reflection of the last (highest-score) vertex through the centroid, clamped to the bounds
+    apply_constraint(temp);
+
+    for ( auto i=0ul; i<num_knobs; ++i)
+        point_reflect[i] = temp[i];
+    
+    generate_unique(point_reflect, false);
+
+    auto key = std::vector<std::size_t>();
+    key.assign(point_reflect, point_reflect + num_knobs);
+
+    auto entry = cache.find(key);
+
+    current_state = reflect;
+    // an unexpired cached measurement is consumed immediately, at most 5 times per get_next() and only before max_iters
+    if ( entry != cache.end() 
+        && times_reentered_start++ < 5
+        && iteration < max_iters ) {
+        auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+        if ( ts_now - entry->second.cache_ts < entry->second.cache_dt ) {
+            return do_reflect(entry->second.objectives.data(), entry->second.knobs.data());
+        }
+    }
+    // otherwise the caller has to profile the reflection point
+    return key;
+}
+
+std::vector<std::size_t> NmdGeneric::do_shrink()
+{   // Pulls every vertex towards the centroid by DELTA and restarts the warmup on the shrunken simplex
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER shrink" << std::endl;
+    )
+
+    std::set<std::vector<std::size_t> > fake;
+    std::vector<std::size_t> key;
+    // "fake" collects the vertices produced so far; generate_unique() is told to avoid them
+    for ( auto i=0ul; i<num_knobs+1; ++i ) {
+        double temp[num_knobs];
+
+        for ( auto j=0ul; j<num_knobs; ++j ) {
+            temp[j] = centroid[j] + DELTA * ((double)simplex[i][j] - (double)centroid[j]);
+        }
+
+        apply_constraint(temp);
+
+        for ( auto j=0ul; j<num_knobs; ++j)
+            initial_config[i][j] = temp[j];
+        
+        generate_unique(initial_config[i], false, &fake);
+
+        key.assign(initial_config[i], initial_config[i]+num_knobs);
+        fake.insert(key);
+    }
+    // the shrunken vertices are stored in initial_config, so they get profiled again by the warmup
+    current_state = warmup;
+    warmup_step = 0;
+
+    OUT_INFO(
+        for (auto i=0ul; i<num_knobs+1; ++i ) {
+            std::cout << "[NMD|Info] Shrank simplex " << i << " : ";
+            for (auto j=0ul; j<num_knobs; ++j )
+                std::cout << this->initial_config[i][j] << " ";                
+            std::cout << std::endl;
+        }
+    )
+    // warmup_step is 0, so do_warmup() does not read the two empty arguments
+    return do_warmup({}, {});
+}
+
+std::vector<std::size_t> NmdGeneric::do_contract_out(const double measurements[], 
+                            const std::size_t observed_knobs[])
+{   // Consumes the measurement of the outside-contraction point and returns the next knobs to try
+    ensure_profile_consistency(point_contract, observed_knobs);
+    score_contract = score(measurements);
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER ContractOUT: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_contract[i] << " ";
+        std::cout << ":" << score_contract << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    // record the measurement in the cache under the observed knobs
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_contract <= score_reflect ){
+        // VV: foc <= fr then replace v[n] with voc
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_contract[i];
+        
+        scores[num_knobs] = score_contract;
+        current_state = start;
+        return do_start(true);
+    } else {
+        current_state = shrink; // contraction did not beat the reflection: shrink the simplex
+        return do_shrink();
+    }
+}
+
+std::vector<std::size_t> NmdGeneric::do_contract_in(const double measurements[], 
+                            const std::size_t observed_knobs[])
+{   // Consumes the measurement of the inside-contraction point and returns the next knobs to try
+    ensure_profile_consistency(point_contract, observed_knobs);
+    score_contract = score(measurements);
+    
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER ContractIN: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_contract[i] << " ";
+        std::cout << ":" << score_contract << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    // record the measurement in the cache under the observed knobs
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_contract < scores[num_knobs] ){
+        // VV: fic < f[n] then replace v[n] with vic
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_contract[i];
+        scores[num_knobs] = score_contract;
+        current_state = start;
+        return do_start(true);
+    } else {
+        current_state = shrink; // contraction did not beat the last vertex: shrink the simplex
+        return do_shrink();
+    }
+}
+
+
+std::vector<std::size_t> NmdGeneric::do_expand(const double measurements[], 
+                            const std::size_t observed_knobs[])
+{   // Consumes the measurement of the expansion point; the last vertex becomes the better of expansion and reflection
+    ensure_profile_consistency(point_expand, observed_knobs);
+    score_expand = score(measurements);
+
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER Expand: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_expand[i] << " ";
+        std::cout << ":" << score_expand << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    // record the measurement in the cache under the observed knobs
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_expand < score_reflect ){
+        // VV: fe < fr then replace v[n] with ve
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_expand[i];
+        scores[num_knobs] = score_expand;
+    } else {
+        // VV: fr <= fe then replace v[n] with vr
+        for (auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_reflect[i];
+        scores[num_knobs] = score_reflect;
+    }
+    // either way a new iteration starts
+    current_state = start;
+    return do_start(false);
+}
+
+std::vector<std::size_t> NmdGeneric::do_reflect(const double measurements[], 
+                            const std::size_t observed_knobs[])
+{   // Consumes the measurement of the reflection point and picks the next step: accept, expand or contract
+    ensure_profile_consistency(point_reflect, observed_knobs);
+    score_reflect = score(measurements);
+
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER Reflect: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << point_reflect[i] << " ";
+        std::cout << ":" << score_reflect << std::endl;
+    )
+
+    auto ts_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+    // record the measurement in the cache under the observed knobs
+    logistics entry;
+    entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+    entry.objectives.assign(measurements, measurements+num_objectives);
+    entry.cache_dt = cache_expire_dt_ms;
+    entry.cache_ts = ts_now;
+
+    cache[entry.knobs] = entry;
+
+    if ( score_reflect >= scores[0] && score_reflect < scores[num_knobs-1]) {
+        // VV: fo <= fr < f[n-1] then replace v[n] with vr and start over
+        for ( auto i=0ul; i<num_knobs; ++i)
+            simplex[num_knobs][i] = point_reflect[i];
+        scores[num_knobs] = score_reflect;
+        current_state = start;
+        return do_start(true);
+    } else if (score_reflect < scores[0]) {
+        double temp[num_knobs];
+        current_state = expand; // reflection beats the best vertex: try to go further along the same direction
+        for (auto i=0ul; i<num_knobs; ++i)
+            temp[i] = centroid[i] + BETA * (point_reflect[i] - (double)centroid[i]);
+        
+        apply_constraint(temp);
+
+        for ( auto i=0ul; i<num_knobs; ++i)
+            point_expand[i] = temp[i];
+        generate_unique(point_expand, false);
+
+        auto key = std::vector<std::size_t>();
+        key.assign(point_expand, point_expand+num_knobs);
+        auto e = cache.find(key);
+        // an unexpired cached measurement of the expansion point is consumed right away
+        if ( e != cache.end() ) {
+            if ( ts_now - e->second.cache_ts < e->second.cache_dt ) {
+                return do_expand(e->second.objectives.data(),
+                                e->second.knobs.data());
+            }
+        }
+
+        return key;
+    } else if (scores[num_knobs-1] <= score_reflect 
+            &&  score_reflect < scores[num_knobs]) {
+        // VV: Reflect lies between f[n-1] and f[n] then contract (outside)
+        current_state = contract_out;
+        double temp[num_knobs];
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            temp[i] = centroid[i] + GAMMA * (point_reflect[i] - (double)centroid[i]);
+        
+        apply_constraint(temp);
+
+        for ( auto i=0ul; i<num_knobs; ++i)
+            point_contract[i] = temp[i];
+        generate_unique(point_contract, false);
+
+        auto key = std::vector<std::size_t>();
+        key.assign(point_contract, point_contract+num_knobs);
+        auto e = cache.find(key);
+        // an unexpired cached measurement of the contraction point is consumed right away
+        if ( e != cache.end() ) {
+            if ( ts_now - e->second.cache_ts < e->second.cache_dt ) {
+                return do_contract_out(e->second.objectives.data(),
+                                e->second.knobs.data());
+            }
+        }
+
+        return key;
+    } else if (score_reflect >= scores[num_knobs]) {
+        // VV: Reflect >= f[n] then contract (inside)
+        current_state = contract_in;
+        double temp[num_knobs];
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            temp[i] = (double) centroid[i] - GAMMA * ((double)point_reflect[i] - (double)centroid[i]);
+        
+        apply_constraint(temp);
+
+        for ( auto i=0ul; i<num_knobs; ++i)
+            point_contract[i] = temp[i];
+        generate_unique(point_contract, false);
+
+        auto key = std::vector<std::size_t>();
+        key.assign(point_contract, point_contract+num_knobs);
+        auto e = cache.find(key);
+        // an unexpired cached measurement of the contraction point is consumed right away
+        if ( e != cache.end() ) {
+            if ( ts_now - e->second.cache_ts < e->second.cache_dt ) {
+                return do_contract_in(e->second.objectives.data(),
+                                e->second.knobs.data());
+            }
+        }
+
+        return key;
+    }
+    // reached only when none of the comparisons above holds (e.g. a NaN score)
+    OUT_INFO(
+        std::cout << "[NMD|Info] Should never get here" << std::endl;
+    )
+
+    current_state = start;
+    return do_start(true);
+}
+
+std::vector<std::size_t> NmdGeneric::do_warmup(const double measurements[], 
+                            const std::size_t observed_knobs[])
+{   // Profiles the initial simplex one vertex per call; after the last vertex it hands over to do_start()
+    std::vector<std::size_t> ret;
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  INNER warmup" << std::endl;
+    )
+    // from the second call on, the arguments describe the vertex proposed by the previous call
+    if ( warmup_step > 0 ) {
+        auto last = warmup_step - 1;
+        ensure_profile_consistency(initial_config[last], observed_knobs);
+        memcpy(simplex[last], initial_config[last], sizeof(std::size_t)*num_knobs);
+        scores[last] = score(measurements);
+        auto key = std::vector<size_t>();
+        key.assign(observed_knobs, observed_knobs+num_knobs);
+        
+        logistics entry;
+
+        entry.cache_dt = cache_expire_dt_ms;
+        entry.cache_ts = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        entry.knobs.assign(observed_knobs, observed_knobs+num_knobs);
+        entry.objectives.assign(measurements, measurements+num_objectives);
+
+        cache[key] = entry;
+        
+        OUT_DEBUG(
+            auto s = score(measurements);
+            std::cout << "[NMD|Dbg]  Score: " << s << " for ";
+            for( auto i=0ul; i<num_knobs; ++i)
+                std::cout << observed_knobs[i] << " ";
+            std::cout << std::endl;
+        )
+    }
+    // all num_knobs+1 vertices have a score: the warmup is over
+    if ( warmup_step == num_knobs +1 ) {
+        OUT_DEBUG(
+            std::cout << "[NMD|Dbg]  Warmup results" << std::endl;
+
+            for (const auto &e:cache) {
+                for (auto i=0ul; i<num_knobs; ++i)
+                    std::cout << e.second.knobs[i] << " ";
+                std::cout << ": ";
+                for (auto i=0ul; i<num_objectives; ++i)
+                    std::cout << e.second.objectives[i] << " ";
+                std::cout << " = " << score(e.second.objectives.data());
+                std::cout << std::endl;
+            }
+        )
+        
+        current_state = start;
+        return do_start(false);
+    }
+    // otherwise propose the next vertex of the initial simplex
+    ret.assign(this->initial_config[warmup_step],
+                this->initial_config[warmup_step]+num_knobs);
+    
+    warmup_step ++;
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] Warmup Explore: ";
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << ret[i] << " ";
+        
+        std::cout << std::endl;
+    )
+
+    return ret;
+}
+
+std::pair<std::vector<std::size_t>, bool> NmdGeneric::get_next(const double measurements[], 
+                            const std::size_t observed_knobs[])
+{   // Feeds one measurement to the state machine; returns (knobs to try next, converged flag)
+    std::vector<std::size_t> ret;
+    #if defined(NMD_DEBUG_) || defined(NMD_INFO_)
+        const char *state_names[] = {
+            "warmup",
+            "start",
+            "reflect",
+            "expand",
+            "contract_in",
+            "contract_out",
+            "shrink"
+        };
+    #endif
+    // NOTE(review): state_names is indexed by current_state, so it presumably mirrors the enum order; confirm in the header
+    OUT_DEBUG(
+        std::cout << "[NMD|Dbg]  Current stage " << state_names[current_state] << std::endl;
+    )
+    
+    switch (current_state) {
+        case warmup:
+            ret = do_warmup(measurements, observed_knobs);
+            break;
+        case start:
+            times_reentered_start = 0;
+            ret = do_start(true);
+            break;
+        case reflect:
+            ret = do_reflect(measurements, observed_knobs);
+            break;
+        case expand:
+            ret = do_expand(measurements, observed_knobs);
+            break;
+        case contract_in:
+            ret = do_contract_in(measurements, observed_knobs);
+            break;
+        case contract_out:
+            ret = do_contract_out(measurements, observed_knobs);
+            break;
+        case shrink:
+            ret = do_shrink();
+            break;
+        default:
+            std::cout << "Unknown state!" << std::endl; // ret stays empty on this path
+    }
+
+    OUT_INFO(
+        std::cout << "[NMD|Info] State " << state_names[current_state] << " proposes ";
+
+        for (auto i=0ul; i<num_knobs; ++i)
+            std::cout << ret[i] << " ";
+        std::cout << std::endl;
+    )
+    // convergence is only tested once the state machine has left (or not re-entered) warmup
+    bool converged = false;
+    if ( current_state != warmup )
+    {
+        converged = test_convergence();
+    }
+    // on convergence the proposal is replaced by the first vertex after a cache-aware sort
+    if ( converged ) {
+        sort_simplex(true);
+        ret.assign(simplex[0], simplex[0] + num_knobs);
+    }
+
+    return std::make_pair(ret, converged);
+}
+
+
+bool NmdGeneric::test_convergence()
+{   // True once iteration reaches max_iters or the spread of the simplex scores is at most conv_threshold
+    double avg, sum;
+
+    avg = 0.0;
+    sum = 0.0;
+    // mean of the num_knobs+1 vertex scores
+    for ( auto i=0ul; i<num_knobs+1; ++i)
+        avg += scores[i];
+    
+    avg /= (num_knobs+1);
+
+    for ( auto i=0ul; i<num_knobs+1; ++i) {
+        double t = scores[i] - avg;
+        sum += t * t;        
+    }
+    // sample standard deviation: num_knobs+1 samples, divisor num_knobs
+    sum /= num_knobs;
+    sum = sqrt(sum);
+
+    if (iteration >= max_iters || sum <= conv_threshold ) {
+        // if ( final_explore == false ) {
+        //     final_explore = true;
+
+        //     return false;
+        // } else {
+        //     return true;
+        // }
+        OUT_INFO(
+            std::cout << "[NMD|Info] Converged at " << sum 
+                      << " threshold: " << conv_threshold << std::endl;
+
+            std::cout << "[NMD|Info] Converged simplex" << std::endl;
+            for ( auto i=0ul; i<num_knobs+1; ++i) {
+                std::cout << "[NMD|Info] Score " << scores[i];
+                for ( auto j=0ul; j<num_knobs; ++j)
+                    std::cout << " " << simplex[i][j];
+                std::cout << std::endl;
+            }
+        )
+        return true;
+    }
+
+    return false;
+}
diff --git a/src/components/nmsimplex_bbincr.cpp b/src/components/nmsimplex_bbincr.cpp
index 97736cd..82ae4e9 100644
--- a/src/components/nmsimplex_bbincr.cpp
+++ b/src/components/nmsimplex_bbincr.cpp
@@ -9,458 +9,1424 @@
  * function with complex analytical evaluation)
  *
  */
-
+#include <vector>
+#include <algorithm>
 #include <allscale/components/nmsimplex_bbincr.hpp>
+#include <cmath>
+
 //#define NMD_DEBUG_ 1
-//#define NMD_INFO_ 1
 
-/* create the initial simplex
+#ifdef NMD_DEBUG_
+#define OUT_DEBUG(X) X
+#else
+#define OUT_DEBUG(X) \
+    {                \
+    }
+#endif
+namespace allscale
+{
+namespace components
+{
 
-   vector<doubl
 
- */
+NelderMead::NelderMead(const NelderMead &other)
+{
+    EPSILON = other.EPSILON;
+    state_ = other.state_;
+    // Copy constructor: duplicates the cached measurements and the simplex bookkeeping assigned below.
+    cache_.insert(other.cache_.begin(), other.cache_.end());
+    warming_up_step = other.warming_up_step;
+    convergence_reevaluating = other.convergence_reevaluating;
+    // NOTE(review): f[], vc[], fr and itr are used elsewhere in this file but are not assigned here -- confirm that is intended.
+    fc = other.fc;
+    fe = other.fe;
+    vs = other.vs;
+    vg = other.vg;
+    vh = other.vh;
+    // Per-knob bounds plus the reflected (vr), expanded (ve) and centroid (vm) points.
+    for (auto i=0; i<NMD_NUM_KNOBS; ++i) {
+        constraint_max[i] = other.constraint_max[i];
+        constraint_min[i] = other.constraint_min[i];
+        vr[i] = other.vr[i];
+        ve[i] = other.ve[i];
+        vm[i] = other.vm[i];
+    }
 
-namespace allscale { namespace components {
+    for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++i ) {
+        opt_weights[i] = other.opt_weights[i];
+        scale[i] = other.scale[i];
+    }
+    // Simplex vertices and the initial configurations.
+    for (auto i=0; i<NMD_NUM_KNOBS+1; ++i )
+    {
+        for ( auto j=0; j<NMD_NUM_KNOBS; ++j ) {
+            v[i][j] = other.v[i][j];
+            initial_configurations[i][j] = other.initial_configurations[i][j];
+        }
+    }
+    // Reset, not copied. NOTE(review): the flag is forced on although next_constraint_min/max are not copied here -- verify.
+    should_update_constraints = true;
+    times_used_cached = 0;
+}
 
 //NelderMead::NelderMead(double (*objfunc)(double[]),double eps){
-NelderMead::NelderMead(double eps){
+NelderMead::NelderMead(double eps)
+{
 
-  EPSILON=eps;
+    EPSILON = eps;
 #ifdef NMD_INFO_
-  std::cout << "[NelderMead|INFO] Initial Convergence Threshold set is " << EPSILON << std::endl;
+    std::cout << "[NelderMead|INFO] Initial Convergence Threshold set is " << EPSILON << std::endl;
 #endif
-  itr=0;
-  state_ = start;
-
-  /* dynamically allocate arrays */
-
-  /* allocate the rows of the arrays */
-  v =  (double **) malloc ((n+1) * sizeof(double *));
-  f =  (double *) malloc ((n+1) * sizeof(double));
-  vr = (double *) malloc (n * sizeof(double));
-  ve = (double *) malloc (n * sizeof(double));
-  vc = (double *) malloc (n * sizeof(double));
-  vm = (double *) malloc (n * sizeof(double));
-
-  /* allocate the columns of the arrays */
-  for (i=0;i<=n;i++) {
-    v[i] = (double *) malloc (n * sizeof(double));
-  }
+    itr = 0;
+    state_ = warmup;
+    
+    warming_up_step = 0;
+    convergence_reevaluating = false;
+
+    for (auto i=0ul; i<NMD_NUM_OBJECTIVES; ++i)
+        scale[i] = 1.0;
 }
 
-void NelderMead::my_constraints(double x[])
+std::pair<int, NelderMead::direction> NelderMead::explore_next_extra(double *extra, int level, 
+                                direction dir, 
+                                int level_max, int level_nested_max)
 {
-  // round to integer and bring again with allowable margins
-  // todo fix: generalize
-  if (x[0] < constraint_min[0] || x[0] > constraint_max[0]){
-    x[0] = (constraint_min[0] + constraint_max[0])/2;
-  }
-
-  if (x[1] < constraint_min[1] || x[1] > constraint_max[1]){
-    x[1] = (constraint_min[1] + constraint_max[1])/2;
-  }
-
-  x[0]=round(x[0]);
-  x[1]=round(x[1]);
+    /*
+    const char *to_string[] = {
+        "up", "up_final", "down", "left", "right", "right_final"
+    };
+    */
+    if ( extra[0] == 0.0 && extra[1] == 0.0 ) {
+        extra[1] = 1.0;
+
+        return std::make_pair(level, dir);
+    }
+    switch (dir) {
+        case (direction::up):
+            if ( extra[1] < level ) {
+                extra[1] += 1.;
+            } else if( extra[0] < level_nested_max ) {
+                extra[0] += 1.;
+                dir = direction::right;
+            } else {
+                level ++;
+            }
+        break;
+
+        case (direction::up_final):
+            if ( extra[1] < level ) {
+                extra[1] += 1.;
+            } else if( extra[0] < level_nested_max ) {
+                extra[0] += 1.;
+                dir = direction::right_final;
+            } else {
+                level ++;
+            }
+        break;
+
+
+        case (direction::down):
+            if ( extra[1] > -level ) {
+                extra[1] -= 1.0;
+            } else if ( extra[0] > -level_nested_max ){
+                extra[0] -= 1.0;
+                dir = direction::left;
+            }
+        break;
+
+        case (direction::left):
+            if ( extra[0] > -level_nested_max ) {
+                extra[0] -= 1.0;
+            } else if (extra[1] < level ) {
+                extra[1] += 1.0;
+                dir = direction::up_final;
+            }
+        break;
+
+        case (direction::right):
+            if ( extra[0] < level_nested_max ) {
+                extra[0] += 1.;
+            } else if ( extra[1] <= level ) {
+                extra[1] -= 1.;
+                dir = direction::down;
+            }
+        break;
+        
+        case (direction::right_final):
+        if ( extra[0] < 0. ) {
+            extra[0] += 1.;
+        } else {
+            level ++; 
+            extra[0] = 0.0;
+            extra[1] = level;
+            dir = direction::right;
+        }
+        break;
+    }
+
+    return std::make_pair(level, dir);
 }
 
-/* FIXME: generalize */
-void NelderMead::initialize_simplex(double params[][2], double values[], double constraint_min[],double constraint_max[])
+template <typename F>
+void NelderMead::generate_new(F &gen)
 {
-  int i,j;
+    double extra[] = {0, 0};
+    double *new_set;
+    int i = 0;
+    int max_combinations = (constraint_max[0] - constraint_min[0]+1) 
+                            * (constraint_max[1] - constraint_min[1]+1);
+    int level = 1;
+    int max_nested_level = constraint_max[1] - constraint_min[1] +1;
+    int max_level = constraint_max[0] - constraint_min[0] +1;
+    direction dir = direction::right;
+
+    // VV: Search for a twice as big space to take into account that
+    //     new_set is not *actually* at 0, 0
+
+    max_level *= 2;
+    max_nested_level *=2;
+    auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    // VV: Restrict search-grid to a maximum block of 5x5
+    int retries = 0;
+    const int retries_threshold = 5*5;
+    int is_same;
+    do
+    {
+        new_set = gen(extra);
+        
+        auto key = std::make_pair((int)new_set[0], (int)new_set[1]);
+        auto entry = cache_.find(key);
+        
+        is_same = 0;
+
+        if ( entry != cache_.end() ) {
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+            is_same = dt <= entry->second._cache_expires_dt;
+        }
+        
+        ++ retries;
+        if ( ( level < max_level +1) 
+             && is_same 
+             && max_combinations > (NMD_NUM_KNOBS + 1)
+             && retries < retries_threshold )
+        {
+            # if 0
+            extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0]) 
+                            + (int)constraint_min[0] 
+                            - (int)(0.5 * (constraint_max[0] - constraint_min[0]));
+
+            extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1]) 
+                                + (int)constraint_min[1] 
+                                - (int)(0.5 * (constraint_max[1] - constraint_min[1]));
+            #else
+            auto logistics = explore_next_extra(extra, level, dir, 
+                                                max_level, max_nested_level);
+            level = logistics.first;
+            dir = logistics.second;
+
+            #endif
+            /*
+            OUT_DEBUG(
+                std::cout << "[NelderMead|Debug] Rejecting " 
+                    << new_set[0] << " " << new_set[1] 
+                    << " will try offset " << extra[0] << " " << extra[1] <<  std::endl;
+            )
+            */
+        } else {
+            break;
+        }
+    } while ( 1 );
+
+    if ( retries >= retries_threshold ) {
+        extra[0] = 0;
+        extra[1] = 0;
 
-  for (i=0;i<=n;i++) {
-    for (j=0;j<n;j++) {
-  	  v[i][j] = params[i][j];
+        gen(extra);
     }
-    f[i]=values[i];
-    this->constraint_min[i]=constraint_min[i];
-    this->constraint_max[i]=constraint_max[i];
-  }
-  itr=0;
 }
 
+void NelderMead::my_constraints(double x[])
+{ // Restricts x[0] and x[1] in place: clamp to the constraint bounds, then round.
+    for (auto i = 0u; i < 2u; ++i)
+    {
+        if (x[i] < constraint_min[i])
+            x[i] = constraint_min[i];
+        else if (x[i] > constraint_max[i])
+            x[i] = constraint_max[i];
+    }
+    // Rounding happens after clamping; only the first two entries of x are ever touched.
+    x[0] = round(x[0]);
+    x[1] = round(x[1]);
+}
 
-/* print out the initial values */
-void NelderMead::print_initial_simplex()
+// Records `objectives` for the (threads, freq_idx) configuration; returns false only for an unknown key when add_if_new is false.
+bool NelderMead::cache_update(int threads, int freq_idx, const double objectives[], bool add_if_new)
 {
-  int i,j;
-  std::cout << "[NelderMead DEBUG] Initial Values\n";
-  for (j=0;j<=n;j++) {
-    for (i=0;i<n;i++) {
-      std::cout << v[j][i] << ",";
-    }
-    std::cout << "Objective value = " << f[j] << std::endl;
-  }
+    auto key = std::make_pair(threads, freq_idx);
+    auto past = cache_.find(key);
+    // Known configuration: overwrite its objectives and adapt how long the entry stays fresh.
+    if (past != cache_.end())
+    {
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        double abs_diff = 0;
+        for (auto j = 0; j < NMD_NUM_OBJECTIVES; ++j)
+        {
+            abs_diff += std::fabs(past->second.objectives[j] - objectives[j]); // magnitude, so opposite-sign changes cannot cancel out
+            past->second.objectives[j] = objectives[j];
+        }
+        // Changed entries get the base expiry back; stable ones have it doubled while below 1024x the base.
+        past->second._cache_timestamp = timestamp_now;
+        // VV: Entries which remain relatively same should be explored less frequently
+        if (abs_diff > 0.1)
+            past->second._cache_expires_dt = CACHE_EXPIRE_AFTER_MS;
+        else if (past->second._cache_expires_dt < CACHE_EXPIRE_AFTER_MS * 1024)
+            past->second._cache_expires_dt *= 2;
+
+        return true;
+    }
+    else if (add_if_new)
+    {
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        optstepresult entry;
+        entry._cache_timestamp = timestamp_now;
+        entry._cache_expires_dt = CACHE_EXPIRE_AFTER_MS;
+        entry.threads = threads;
+        entry.freq_idx = freq_idx;
+
+        for (auto j = 0; j < NMD_NUM_OBJECTIVES; ++j)
+            entry.objectives[j] = objectives[j];
+
+        cache_.insert(std::make_pair(key, entry));
+
+        return true;
+    }
+
+    return false;
 }
 
+void NelderMead::invalidate_cache()
+{
+    should_invalidate_cache = true;
+}
 
-/* print out the value at each iteration */
-void NelderMead::print_iteration()
+void NelderMead::reevaluate_scores()
 {
-  int i,j;
-  std::cout << "[NelderMead DEBUG] Iteration " << itr << std::endl;
-  //printf("Iteration %d\n",itr);
-  for (j=0;j<=n;j++) {
-    std::cout << "[NelderMead DEBUG] Vertex-" << j+1 << "=(";
-    for (i=0;i<n;i++) {
-      //printf("%f %f\n\n",v[j][i],f[j]);
-      std::cout << v[j][i];
-      if (i<n-1)
-        std::cout << "," ;
-    }
-    std::cout << ")=" << f[j] << std::endl;
-  }
-  std::cout << "[NelderMead DEBUG] Current Objective Minimum is at: " << f[vs] << std::endl;
-  std::cout << "[NelderMead DEBUG] f[vs]= " << f[vs] << ", vs = " << vs << std::endl;
-  std::cout << "[NelderMead DEBUG] f[vh]= " << f[vh] << ", vh = " << vh << std::endl;
-  std::cout << "[NelderMead DEBUG] f[vg]= " << f[vg] << ", vg = " << vg << std::endl;
+    should_reevaluate_scores = true;
 }
 
+void NelderMead::do_invalidate_cache()
+{
+    cache_.clear();
+    should_invalidate_cache = false;
+}
 
-/* find the index of the largest value */
-int NelderMead::vg_index()
+// Rebuilds the simplex ordering: from the best fresh cache entries when there are enough, else by re-scoring the current vertices.
+void NelderMead::do_reevaluate_scores()
 {
-  int j;
-  int vg=0;
+    auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
 
-  for (j=0;j<=n;j++) {
-    if (f[j] > f[vg]) {
-      vg = j;
+    std::vector<optstepresult> fresh;
+    should_reevaluate_scores = false;
+    // Collect every cache entry that has not expired yet.
+    for ( const auto &entry: cache_ ) {
+        auto dt = timestamp_now - entry.second._cache_timestamp;
+        if ( dt <= entry.second._cache_expires_dt )
+            fresh.push_back(entry.second);
+    }
+    if ( fresh.size() >= NMD_NUM_KNOBS +1 ) {
+        std::sort(fresh.begin(), fresh.end(), 
+            [this](const optstepresult &l, const optstepresult &r) mutable -> bool {
+                return evaluate_score(l.objectives, nullptr) < 
+                        evaluate_score(r.objectives, nullptr);
+            });
+        // Adopt the lowest-scoring fresh entries as the new simplex and refresh f[] so it matches the new vertices.
+        for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i ) {
+            v[i][0] = fresh[i].threads;
+            v[i][1] = fresh[i].freq_idx;
+            f[i] = evaluate_score(fresh[i].objectives, opt_weights);
+        }
+        // Ascending sort: index 0 holds the smallest score (vs) and index 2 the greatest (vg).
+        vs = 0;
+        vh = 1;
+        vg = 2;
+    }
+    else
+    {
+        for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i )
+        {
+            auto key = std::make_pair( (int)v[i][0], (int)v[i][1] );
+            auto entry = cache_.find(key);
+            if ( entry != cache_.end() ) {
+                f[i] = evaluate_score(entry->second.objectives, opt_weights);
+            }
+        }
+        sort_vertices();
     }
-  }
-  return vg;
+    OUT_DEBUG(
+            std::cout << "[NelderMead|DEBUG] Re-Evaluated all scores" << std::endl;
+            print_initial_simplex();
+        )
+    centroid();
 }
 
+void NelderMead::set_scale(const double scale[NMD_NUM_OBJECTIVES])
+{
+    for ( auto i=0ul; i<NMD_NUM_OBJECTIVES; ++i )
+        this->scale[i] = scale[i];
+    
+    reevaluate_scores();
+}
 
-/* find the index of the smallest value */
-int NelderMead::vs_index()
+double NelderMead::evaluate_score(const double objectives[], const double *weights)
 {
-  int j;
-  int vs=0;
+    double score;
+    // VV: [time, energy/power, resources]
+    // Folds the objectives into one scalar; a null `weights` falls back to opt_weights.
+    if (weights == nullptr)
+        weights = opt_weights;
+    // Only the #else branch is compiled: the product of exp(weight * objective / scale) terms.
+    #if 0
+    score = 0.0;
+    for (auto i = 0; i < NMD_NUM_OBJECTIVES; ++i)
+    {
+        double t = objectives[i] / scale[i];
+        score += t * t * weights[i];
+    }
+    #else 
+    score = 1.0;
+    for ( auto i=0; i<NMD_NUM_OBJECTIVES; ++ i) {
+        score *= std::exp((objectives[i]/scale[i]) * weights[i]);
+    }
+    #endif
+    return score;
+}
 
-  for (j=0;j<=n;j++) {
-    if (f[j] < f[vs]) {
-      vs = j;
+void NelderMead::set_weights(const double weights[3])
+{
+    opt_weights[0] = weights[0];
+    opt_weights[1] = weights[1];
+    opt_weights[2] = weights[2];
+    OUT_DEBUG(
+        std::cout << "[NelderMead|DEBUG] Weights: " 
+                << opt_weights[0] << " "
+                << opt_weights[1] << " "
+                << opt_weights[2] << std::endl;
+    )
+}
+#if 0
+void NelderMead::initialize_simplex(const double weights[3],
+                                    const double constraint_min[2],
+                                    const double constraint_max[2])
+{
+    int i, j;
+    auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    for (i = 0; i < NMD_NUM_KNOBS; i++)
+    {
+        this->constraint_min[i] = constraint_min[i];
+        this->constraint_max[i] = constraint_max[i];
+    }
+
+    OUT_DEBUG(
+        std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl;
+        std::cout << constraint_min[0] 
+                    << ":" << constraint_max[0] << std::endl;
+        std::cout << constraint_min[1] 
+                    << ":" << constraint_max[1] << std::endl;
+    )
+
+    set_weights(weights);
+    state_ = warmup;
+    itr = 0;
+    warming_up_step = 0;
+    convergence_reevaluating = false;
+    cache_.clear();
+
+    for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
+        int is_ok = 1;
+        do {
+            
+            for (j=0; j<NMD_NUM_KNOBS; ++j)
+                initial_configurations[i][j] = constraint_min[j] + rand() % (int) (constraint_max[j] - constraint_min[j]+1);
+            
+            is_ok = 1;
+
+            for (auto c=0; c<i && is_ok == 1; ++c)
+            {
+                is_ok = 0;
+                for ( j=0; j<NMD_NUM_KNOBS; ++j )
+                    is_ok |= (initial_configurations[c][j] != initial_configurations[i][j]);
+            }
+
+        } while (is_ok == 0);
+
+        OUT_DEBUG(
+            std::cout << "[NelderMead|DEBUG] Random initial simplex [" << i << "]: ";
+            for ( j =0; j<NMD_NUM_KNOBS; ++j) 
+                std::cout << initial_configurations[i][j] << " ";
+            std::cout << std::endl;
+        )
     }
-  }
-  return vs;
 }
+#endif
 
+void NelderMead::update_constraints(const double constraint_min[NMD_NUM_KNOBS],
+							    const double constraint_max[NMD_NUM_KNOBS])
+{
+    for (auto i=0; i<NMD_NUM_KNOBS; ++i) {
+        next_constraint_min[i] = constraint_min[i];
+        next_constraint_max[i] = constraint_max[i];
+    }
 
-/* find the index of the second largest value */
-int NelderMead::vh_index()
+    should_update_constraints = true;
+}
+
+/* FIXME: generalize */
+void NelderMead::initialize_simplex(const double weights[3],
+                                    const double initial_simplex[][NMD_NUM_KNOBS],
+                                    const double constraint_min[2],
+                                    const double constraint_max[2])
+{
+    int i, j;
+    long timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+
+    for (i=0; i<NMD_NUM_KNOBS; ++i )
+    {
+        this->constraint_min[i] = constraint_min[i];
+        this->constraint_max[i] = constraint_max[i];
+    }
+
+    OUT_DEBUG(
+        std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl;
+        std::cout << constraint_min[0] 
+                    << ":" << constraint_max[0] << std::endl;
+        std::cout << constraint_min[1] 
+                    << ":" << constraint_max[1] << std::endl;
+    )
+
+    set_weights(weights);
+    state_ = warmup;
+    itr = 0;
+    warming_up_step = 0;
+    convergence_reevaluating = false;
+    cache_.clear();
+    if (initial_simplex == nullptr)
+    {
+        #if 0
+        int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) 
+                    + constraint_min[1]);
+        int threads_med = round(0.5 * (constraint_max[0] - constraint_min[1])
+                    + constraint_min[1]);
+        int threads_high = constraint_max[0] * 0.75;
+
+        initial_configurations[0][0] = threads_low;
+        initial_configurations[0][1] = (int)constraint_min[1];
+
+        initial_configurations[1][0] = threads_high;
+        initial_configurations[1][1] = (int)constraint_min[1];
+
+        initial_configurations[2][0] = threads_high;
+        initial_configurations[2][1] = (int)constraint_max[1];
+        #else
+        for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
+            int is_ok = 1;
+            do {
+                
+                for (j=0; j<NMD_NUM_KNOBS; ++j)
+                    initial_configurations[i][j] = constraint_min[j] + rand() % (int) (constraint_max[j] - constraint_min[j]+1);
+                
+                is_ok = 1;
+
+                for (auto c=0; c<i && is_ok == 1; ++c)
+                {
+                    is_ok = 0;
+                    for ( j=0; j<NMD_NUM_KNOBS; ++j )
+                        is_ok |= (initial_configurations[c][j] != initial_configurations[i][j]);
+                }
+
+            } while (is_ok == 0);
+        }
+        #endif
+    } else {
+        double knob_set[NMD_NUM_KNOBS];
+        for (i=0; i<NMD_NUM_KNOBS+1; ++i ) {
+            for (j=0; j<NMD_NUM_KNOBS; ++j ) {
+                knob_set[j] = initial_simplex[i][j];
+            }
+            my_constraints(knob_set);
+            for (j=0; j<NMD_NUM_KNOBS; ++j ) {
+                initial_configurations[i][j] = (int) knob_set[j];
+            }
+        }
+    }
+
+    OUT_DEBUG(
+        for (auto i=0; i<NMD_NUM_KNOBS+1; ++i) {
+            std::cout << "[NelderMead|DEBUG] (initialize) initial simplex [" << i << "]: ";
+            for ( j =0; j<NMD_NUM_KNOBS; ++j) 
+                std::cout << initial_configurations[i][j] << " ";
+            std::cout << std::endl;
+        }
+    )
+}
+
+/* print out the initial values */
+void NelderMead::print_initial_simplex()
 {
-  int j;
+    int i, j;
+    std::cout << "[NelderMead DEBUG] Initial Values (Order indices:" 
+        << vs << ", " << vh << ", " << vg << ")" << std::endl;
+
+    for (j = 0; j < NMD_NUM_KNOBS + 1; j++)
+    {
+        
+        for (i = 0; i < NMD_NUM_KNOBS; i++)
+        {
+            std::cout << v[j][i] << ",";
+        }
+        const int threads = (int) v[j][0];
+        const int freq_idx = (int) v[j][1];
 
-  for (j=0;j<=n;j++) {
-    if (f[j] > f[vh] && f[j] < f[vg]) {
-      vh = j;
+        auto e = cache_.find(std::make_pair(threads, freq_idx));
+        std::cout << " Objective value = "<< std::flush << f[j] << std::flush;
+
+        if ( e == cache_.end() )
+        {
+            std::cout << " (not in cache)" << std::flush << std::endl;
+        } else {
+            std::cout << " OBJs: " << std::flush
+                     << e->second.objectives[0] << " "
+                     << e->second.objectives[1] << " "
+                     << e->second.objectives[2] << " "
+                     << std::endl;
+        }
+        std::cout << std::flush;
     }
-  }
-  return vh;
 }
 
+/* print out the value at each iteration */
+void NelderMead::print_iteration()
+{
+    int i, j;
+    std::cout << "[NelderMead DEBUG] Iteration " << itr << std::endl;
+    //printf("Iteration %d\n",itr);
+    for (j = 0; j <= n; j++)
+    {
+        std::cout << "[NelderMead DEBUG] Vertex-" << j + 1 << "=(";
+        for (i = 0; i < n; i++)
+        {
+            //printf("%f %f\n\n",v[j][i],f[j]);
+            std::cout << v[j][i];
+            if (i < n - 1)
+                std::cout << ",";
+        }
+        std::cout << ")=" << f[j] << std::endl;
+    }
+
+    std::cout << "[NelderMead DEBUG] Current Objective Minimum is at: " << f[vs] << std::endl;
+    std::cout << "[NelderMead DEBUG] f[vs]= " << f[vs] << ", vs = " << vs << std::endl;
+    std::cout << "[NelderMead DEBUG] f[vh]= " << f[vh] << ", vh = " << vh << std::endl;
+    std::cout << "[NelderMead DEBUG] f[vg]= " << f[vg] << ", vg = " << vg << std::endl;
+}
 
 /* calculate the centroid */
 void NelderMead::centroid()
 {
-  int j,m;
-  double cent;
-
-  for (j=0;j<=n-1;j++) {
-    cent=0.0;
-    for (m=0;m<=n;m++) {
-      if (m!=vg) {
-	      cent += v[m][j];
-      }
-    }
-    vm[j] = cent/n;
-  }
+    int j, m;
+    double cent;
+
+    for (j = 0; j < NMD_NUM_KNOBS; j++)
+    {
+        cent = 0.0;
+        for (m = 0; m < NMD_NUM_KNOBS +1; m++)
+        {
+            if (m != vg)
+            {
+                cent += v[m][j];
+            }
+        }
+        vm[j] = cent / n;
+    }
+
+    my_constraints(vm);
+
+    OUT_DEBUG (
+        std::cout << "[NelderMead|DEBUG] New Centroid: " 
+        << vm[0] << " " << vm[1] << std::endl;
+    )
 }
 
-optstepresult NelderMead::step(double param)
+void NelderMead::sort_vertices()
 {
-  optstepresult res;
-  res.threads=0;
-  res.freq_idx=-1;
-  switch (state_){
+    // VV: maps the leftover bit (1, 2 or 4) to vertex index 0, 1 or 2; the other slots are 0 padding that the lookup below never selects
+    int map_to_index[] = {
+        0, 0, 1, 0, 2, 0, 0, 0};
 
-    /** ITERATION START **/
-    case start:
-      itr++;
-#ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
-      print_initial_simplex();
-#endif
-      // todo: implement here the simplex initialization, currently this is
-      // done in the constructor
-
-      /* find the index of the largest value (W) */
-      vg = vg_index();
-
-      /* find the index of the smallest value (B) */
-      vs = vs_index();
-
-      /* find the index of the second largest value (G) */
-      vh = vh_index();
-
-      /* calculate the centroid */
-      centroid();
-
-      /* reflect vg to new vertex vr */
-      for (j=0;j<=n-1;j++) {
-        /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j];*/
-        /*
-        std::cout << "vm[" << j << "]=" << vm[j] << std::endl;
-        std::cout << "v[vg" << j << "]=" << v[vg][j] << std::endl;
-        std::cout << "ALPHA=" << ALPHA << std::endl;
-        */
-        vr[j] = vm[j]+ALPHA*(vm[j]-v[vg][j]);
-      }
-      my_constraints(vr);
-#ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Reflection Parameter = ("
-                << vr[0] << "," << vr[1] << ")"
-                << std::endl;
-#endif
-      // enter reflection state
-      state_=reflection;
-      res.threads=vr[0];
-      res.freq_idx=vr[1];
+    vg = vs = vh = 0;
 
-      break;
+    // VV: Compute greatest, smallest, and half-point
+    for (i = 0; i <= n; ++i)
+    {
+        vg = f[i] > f[vg] ? i : vg;
+        vs = f[i] < f[vs] ? i : vs;
+    }
 
-    /** REFLECTION **/
+    // VV: Find out what's the half-point by using a bitmap,
+    //     when vg==vs that means that all points are equal
+    if (vg != vs)
+    {
+        vh = 1 + 2 + 4 - (1 << vg) - (1 << vs);
+        vh = map_to_index[vh];
+    }
+    else
+    {
+        vg = 2;
+        vh = 1;
+        vs = 0;
+    }
+}
+
+optstepresult NelderMead::do_step_start()
+{
+    optstepresult res;
+    times_used_cached ++;
+
+    OUT_DEBUG(
+        std::cout << "[NelderMead DEBUG] State = Start" << std::endl;
+        print_initial_simplex();
+    )
+
+    sort_vertices();
+
+    centroid();   
+
+    // VV: Try not to pick a knob_set that already exists in `v`
+    auto gen_new = [this](double *extra) mutable -> double* {
+        
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
+            vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]) - extra[j];
+       
+        my_constraints(vr);
+
+        return vr;
+    };
+    
+    generate_new(gen_new);
 
-    /** This state is entered when we have received a sample of the objective
-     ** function at the reflection vertex
-     **/
-    case reflection:
-#ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
-#endif
-      fr=param;
-      //fr = objfunc(vr);
-
-		  if (fr < f[vh]){ // f(R) < f(G) - Case (i)
-        if (fr >= f[vs]) { // f(R)>f(B)
-          for (j=0;j<=n-1;j++) { // replace W with R and end iteration
-	         v[vg][j] = vr[j];
-          }
-          f[vg] = fr;
-          updateObjectives();
-          state_=start;
-          break;
-        }
-
-        /* investigate a step further through expansion in this direction */
-        else{
-          for (j=0;j<=n-1;j++) {
-            /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j];*/
-            ve[j] = vm[j]+GAMMA*(vr[j]-vm[j]);
-          }
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Expansion Parameter = ("
-                << ve[0] << "," << ve[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead DEBUG] Reflection Parameter = ("
+              << vr[0] << "," << vr[1] << ")"
+              << std::endl;
 #endif
-          my_constraints(ve);
-          // enter the state waiting for a sampled value of the objective function
-          // at the expansion vertex
-          state_=expansion;
-          res.threads=ve[0];
-          res.freq_idx=ve[1];
-
-          break;
-        }
-
-      }else{ // f(R) > f(G) - Case (ii)
-        if (fr < f[vg]) { // f(R) < f(W)
-          for (j=0;j<=n-1;j++) {  // replace W with R
-           v[vg][j] = vr[j];
-          }
-          f[vg] = fr;
-        }
-
-        if (fr < f[vg] && fr >= f[vh]) {
-	        /* perform outside contraction */
-	        for (j=0;j<=n-1;j++) {
-	          /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j];*/
-	          vc[j] = vm[j]+BETA*(vr[j]-vm[j]);
-	        }
+    // enter reflection state
+    state_ = reflection;
+    res.threads = vr[0];
+    res.freq_idx = vr[1];
+
+    auto key = std::make_pair(res.threads, res.freq_idx);
+
+    auto entry = cache_.find(key);
+
+    //VV: Fixme, remove recursion due to cache
+    if (entry != cache_.end() && times_used_cached < 15)
+    {
+        auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+        auto dt = timestamp_now - entry->second._cache_timestamp;
+
+        if (dt < entry->second._cache_expires_dt)
+        {
+            return do_step_reflect(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
+        }
+    }
+
+    return res;
+}
+
+optstepresult NelderMead::do_step_reflect(const double objectives[], 
+            double knob1, double knob2)
+{
+    optstepresult res;
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Contraction Parameter = ("
-                << vc[0] << "," << vc[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl;
 #endif
-          my_constraints(vc);
-          // enter the state waiting for a sampled value of the objective function
-          // at the outside contraction vertex
-          state_=contraction;
-          res.threads=vc[0];
-          res.freq_idx=vc[1];
-          break;
-        } else {
-	        /* perform inside contraction */
-	        for (j=0;j<=n-1;j++) {
-	          /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j];*/
-	          vc[j] = vm[j]-BETA*(vm[j]-v[vg][j]);
-	        }
+    // VV: Make sure that we actually profiled what we meant to
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled);
+
+    if ( vr[0] != profiled[0] || vr[1] != profiled[1] ) {
+        std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile " << vr[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
+        
+        auto key = std::make_pair((int)vr[0], (int)vr[1]);
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled[0];
+            iter->second.freq_idx = profiled[1];
+        }
+
+        vr[0] = profiled[0];
+        vr[1] = profiled[1];
+
+        cache_update((int)vr[0], (int)vr[1], objectives, true);
+    }
+
+    fr = evaluate_score(objectives, opt_weights);
+    
+    if ((f[vs] <= fr) && (fr < f[vh]))
+    {
+        // VV: REFLECTED point is better than the SECOND BEST
+        //     but NOT better than the BEST
+        //     Replace WORST point with REFLECTED
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = vr[j];
+        }
+
+        my_constraints(v[vg]);
+
+        f[vg] = fr;
+
+        const int threads = (int)(v[vg][0]);
+        const int freq_idx = (int)(v[vg][1]);
+
+        cache_update(threads, freq_idx, objectives, true);
+
+        state_ = start;
+        return do_step_start();
+    }
+    else if (fr < f[vs])
+    {
+        // VV: REFLECTED is better than BEST
+        auto gen_new = [this](double *extra) mutable -> double* {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]) - extra[j];
+                
+            my_constraints(ve);
+
+            return ve;
+        };
+    
+        generate_new(gen_new);
+
+        // VV: Now evaluate EXPANDED
+        res.threads = ve[0];
+        res.freq_idx = ve[1];
+
+        state_ = expansion;
+
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_expand(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
+            }
+        }
+
+        return res;
+    }
+    else if ((f[vh] <= fr) && (fr < f[vg]))
+    {
+        // VV: REFLECTED between SECOND BEST and WORST
+        auto gen_new = [this](double *extra) mutable -> double* {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                vc[j] = vm[j] + BETA * (vr[j] - vm[j]) - extra[j];
+                
+            my_constraints(vc);
+
+            return vc;
+        };
+
+        generate_new(gen_new);  
+        
+        // VV: Now evaluate EXPANDED
+        res.threads = vc[0];
+        res.freq_idx = vc[1];
+
+        state_ = contraction_out;
+
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_contract_out(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
+            }
+        }
+
+        return res;
+    }
+    else
+    {
+        // VV: REFLECTED worse than WORST
+        auto gen_new = [this](double *extra) mutable -> double* {
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                vc[j] = vm[j] - BETA * (vr[j] - vm[j]) - extra[j];
+                
+            my_constraints(vc);
+
+            return vc;
+        };
+
+        generate_new(gen_new);
+
+        // VV: Now evaluate EXPANDED
+        res.threads = vc[0];
+        res.freq_idx = vc[1];
+
+        state_ = contraction_in;
+        auto key = std::make_pair(res.threads, res.freq_idx);
+
+        auto entry = cache_.find(key);
+
+        if (entry != cache_.end())
+        {
+            auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+            auto dt = timestamp_now - entry->second._cache_timestamp;
+
+            if (dt < entry->second._cache_expires_dt)
+            {
+                return do_step_contract_in(entry->second.objectives,
+                    entry->second.threads,
+                    entry->second.freq_idx);
+            }
+        }
+
+        return res;
+    }
+}
+
+optstepresult NelderMead::do_step_expand(const double objectives[],
+    double knob1, double knob2)
+{ // Handles the measurement taken for the EXPANDED vertex `ve`; knob1/knob2 are the knobs that were actually profiled
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] Contraction Parameter = ("
-                << vc[0] << "," << vc[1] << ")"
-                << std::endl;
+    std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl;
 #endif
-	        my_constraints(vc);
-          state_=contraction;
-          res.threads=vc[0];
-          res.freq_idx=vc[1];
-          break;
+    fe = evaluate_score(objectives, nullptr); // score of the EXPANDED point
+
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled); // same constraint pass that was applied to `ve` when it was generated
+
+    if ( ve[0] != profiled[0] || ve[1] != profiled[1] ) { // the measured knobs are not the ones we asked for
+        std::cout << "[NelderMead|WARN] Meant to profile expand " << ve[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile expand " << ve[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
+        
+        auto key = std::make_pair((int)ve[0], (int)ve[1]); // entry stored under the requested knobs
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled[0]; // record which knobs really produced that entry
+            iter->second.freq_idx = profiled[1];
         }
 
+        ve[0] = profiled[0]; // continue the search from the point that was really measured
+        ve[1] = profiled[1];
 
-    /** EXPANSION **/
+        cache_update((int)ve[0], (int)ve[1], objectives, true);
+    }
 
-    /** This state is entered when we have received a sample of the objective
-     ** function at the expansion vertex
-     **/
-    case expansion:
+    if (fe < fr)
+    {
+        // VV: EXPANDED point is better than REFLECTIVE
+        //     Replace WORST with EXPANDED
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = ve[j];
+        }
+        f[vg] = fe;
+    }
+    else
+    {
+        // VV: Replace WORST with REFLECTED
+        for (j = 0; j <= n - 1; j++)
+        {
+            v[vg][j] = vr[j];
+        }
+        f[vg] = fr;
+    }
+
+    state_ = start; // the simplex is updated; the next call begins a new iteration
+    const int threads = (int)(v[vg][0]);
+    const int freq_idx = (int)(v[vg][1]);
+
+    cache_update(threads, freq_idx, objectives, true); // NOTE(review): when REFLECTED was kept above, `objectives` still holds the EXPANDED measurement - confirm this is what should be cached for v[vg]
+    return do_step_start();
+}
+
+optstepresult NelderMead::do_step_contract_in(const double objectives[],
+    double knob1, double knob2)
+{ // Handles the measurement taken for the inside-CONTRACTED vertex `vc`; knob1/knob2 are the knobs that were actually profiled
+    int j;
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl;
+    std::cout << "[NelderMead|DEBUG] State = ContractionIN" << std::endl;
 #endif
-      fe=param;
-      //fe = objfunc(ve);
-      if (fe < f[vs]) { // if f(E)<f(B)
-  	    for (j=0;j<=n-1;j++) { // replace W with E
-  	      v[vg][j] = ve[j];
-  	    }
-  	    f[vg] = fe;
-      }
-      else {
-  	    for (j=0;j<=n-1;j++) { // replace W with E
-  	      v[vg][j] = vr[j];
-  	    }
-  	    f[vg] = fr;
-      }
-      updateObjectives();
-      state_=start;
-      break;
-
-    /** CONTRACTION **/
-
-    /** This state is entered when we have received a sample of the objective
-     ** function at the contraction vertex
-     **/
-    case contraction:
+    fc = evaluate_score(objectives, nullptr); // score of the CONTRACTED point
+
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled); // same constraint pass that was applied to `vc` when it was generated
+
+    if ( vc[0] != profiled[0] || vc[1] != profiled[1] ) { // the measured knobs are not the ones we asked for
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
+        
+        auto key = std::make_pair((int)vc[0], (int)vc[1]); // entry stored under the requested knobs
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled[0]; // record which knobs really produced that entry
+            iter->second.freq_idx = profiled[1];
+        }
+
+        vc[0] = profiled[0]; // continue the search from the point that was really measured
+        vc[1] = profiled[1];
+
+        cache_update((int)vc[0], (int)vc[1], objectives, true);
+    }
+
+    if (fc <= f[vg]) // compare against the vertex that is replaced below (WORST), not a fixed slot
+    {
+        // VV: CONTRACTED_I is better than WORST
+        //     Replace WORST with CONTRACTED_I
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
+        {
+            v[vg][j] = vc[j];
+        }
+        f[vg] = fc;
+
+        const int threads = (int)(v[vg][0]);
+        const int freq_idx = (int)(v[vg][1]);
+
+        cache_update(threads, freq_idx, objectives, true);
+        return do_step_start();
+    }
+    else
+    {
+        state_ = shrink; // contraction did not help: shrink the simplex
+        return do_step_shrink();
+    }
+}
+
+optstepresult NelderMead::do_step_contract_out(const double objectives[],
+    double knob1, double knob2)
+{ // Handles the measurement taken for the outside-CONTRACTED vertex `vc`; knob1/knob2 are the knobs that were actually profiled
+    int j;
 #ifdef NMD_DEBUG_
-      std::cout << "[NelderMead|DEBUG] State = Contraction" << std::endl;
+    std::cout << "[NelderMead|DEBUG] State = ContractionOUT" << std::endl;
 #endif
-      fc=param;
-      //fc = objfunc(vc);
-      if (fc < f[vg]) { // f(C)<f(W)
-  	    for (j=0;j<=n-1;j++) {
-	        v[vg][j] = vc[j];
-	      }
-	      f[vg] = fc;
-      } else {
-        // apply shrinking
-	      for (row=0;row<=n;row++) {
-	        if (row != vs) {
-	          for (j=0;j<=n-1;j++) {
-	            v[row][j] = v[vs][j]+(v[row][j]-v[vs][j])/2.0;
-              my_constraints(v[row]);
-	         }
-	        }
-	      }
-      }
-      updateObjectives();
-      state_=start;
-      break;
-    }
-  }
-
-  /* print out the value at each iteration */
+    fc = evaluate_score(objectives, nullptr); // score of the CONTRACTED point
+
+    double profiled[] = {knob1, knob2};
+    my_constraints(profiled); // same constraint pass that was applied to `vc` when it was generated
+
+    if ( vc[0] != profiled[0] || vc[1] != profiled[1] ) { // the measured knobs are not the ones we asked for
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[0] << " knob1 "
+                     "but ended up using " << profiled[0] << std::endl;
+        std::cout << "[NelderMead|WARN] Meant to profile contract " << vc[1] << " knob2 "
+                     "but ended up using " << profiled[1] << std::endl;
+        
+        auto key = std::make_pair((int)vc[0], (int)vc[1]); // entry stored under the requested knobs
+        auto iter = cache_.find(key);
+        if ( iter != cache_.end() ) {
+            iter->second.threads = profiled[0]; // record which knobs really produced that entry
+            iter->second.freq_idx = profiled[1];
+        }
+
+        vc[0] = profiled[0]; // continue the search from the point that was really measured
+        vc[1] = profiled[1];
+
+        cache_update((int)vc[0], (int)vc[1], objectives, true);
+    }
+
+    if (fc <= fr) // accept the contraction only if it is at least as good as the reflected point
+    {
+        // VV: CONTRACTED_O is better than REFLECTED
+        //     Replace WORST with CONTRACTED_O
+        for (j = 0; j < NMD_NUM_KNOBS; j++)
+        {
+            v[vg][j] = vc[j];
+        }
+        f[vg] = fc;
+
+        const int threads = (int)(v[vg][0]);
+        const int freq_idx = (int)(v[vg][1]);
+
+        cache_update(threads, freq_idx, objectives, true);
+        return do_step_start(); // NOTE(review): unlike do_step_expand, state_ is not set to `start` here - presumably do_step_start() sets the next state itself; confirm
+    }
+    else
+    {
+        state_ = shrink; // contraction did not help: shrink the simplex
+        return do_step_shrink();
+    }
+}
+
+optstepresult NelderMead::do_step_shrink()
+{ // Shrink step: builds one candidate per simplex vertex, then restarts the warmup phase
 #ifdef NMD_DEBUG_
-  print_iteration();
+    std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl;
 #endif
-  res.converged=testConvergence();
-  return res;
+    for (auto i=0ul; i<NMD_NUM_KNOBS+1; ++i) { // one pass per vertex of the simplex
+        auto gen_new = [this, i](double *extra) mutable -> double* {      
+            for (j = 0; j < NMD_NUM_KNOBS; j++)
+                vr[j] = vm[j] + DELTA * (v[i][j] - vm[j]) - extra[j]; // pull vertex i towards vm by factor DELTA, minus the caller-supplied offset
+        
+            my_constraints(vr);
+
+            return vr;
+        };
+        
+        generate_new(gen_new); // NOTE(review): the candidate ends up in `vr` and is overwritten on the next pass; nothing visible here writes it back to v[i] or initial_configurations - confirm generate_new() stores it
+    }
+
+    state_ = warmup; // re-profile the simplex from scratch
+    warming_up_step = 0;
+    return do_step_warmup({}, 0, 0); // objectives is null here; step 0 of the warmup does not read it
 }
 
-bool NelderMead::testConvergence(){
-
-  fsum = 0.0;
-  for (j=0;j<=n;j++) {
-    fsum += f[j];
-  }
-  favg = fsum/(n+1);
-  s = 0.0;
-  for (j=0;j<=n;j++) {
-    s += pow((f[j]-favg),2.0)/(n);
-  }
-  s = sqrt(s);
-  s = s /favg; // normalization step
-#ifdef NMD_INFO_
-  std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl;
-  std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl;
-#endif
-  if (s >= EPSILON && itr <= MAXITERATIONS)
-    return false;
-  else{
-    vs = vs_index();
-    min=f[vs];
-    return true;
-  }
+optstepresult NelderMead::do_step_warmup(const double objectives[],
+            double knob1, double knob2)
+{ // Warmup: profiles the NMD_NUM_KNOBS+1 initial configurations one call at a time, recording the score of the previous one on each call
+    #ifdef NMD_DEBUG_
+        std::cout << "[NelderMead|DEBUG] State = Warmup " 
+                    << warming_up_step << std::endl;
+    #endif
+
+    OUT_DEBUG(
+        if ( warming_up_step == 0 ) {
+            std::cout << "[NelderMead|DEBUG] Initial exploration" << std::endl;
+
+            for ( auto i =0; i<NMD_NUM_KNOBS+1; ++i ) {
+                std::cout << "Simplex[" << i <<"]:";
+                for ( auto j=0; j<NMD_NUM_KNOBS; ++j )
+                    std::cout << " " << initial_configurations[i][j];
+                std::cout << std::endl;
+            }
+        }
+    )
+
+    // VV: Make sure that we actually profiled what we meant to
+    if ( warming_up_step > 0 && warming_up_step <= NMD_NUM_KNOBS + 1) {
+        double profiled[] = {knob1, knob2};
+        my_constraints(profiled); // same constraint pass that is applied to simplex vertices
+
+        if ( v[warming_up_step-1][0] != profiled[0] || v[warming_up_step-1][1] != profiled[1] ) {
+            std::cout << "[NelderMead|WARN] Meant to profile warmup " << v[warming_up_step-1][0] << " knob1 "
+                        "but ended up using " << profiled[0] << std::endl;
+            std::cout << "[NelderMead|WARN] Meant to profile warmup " << v[warming_up_step-1][1] << " knob2 "
+                        "but ended up using " << profiled[1] << std::endl;
+            
+            auto key = std::make_pair((int)v[warming_up_step-1][0], (int)v[warming_up_step-1][1]);
+            auto iter = cache_.find(key);
+            if ( iter != cache_.end() ) {
+                iter->second.threads = profiled[0]; // record which knobs really produced that entry
+                iter->second.freq_idx = profiled[1];
+            }
+
+            v[warming_up_step-1][0] = profiled[0]; // keep the vertex in sync with what was measured
+            v[warming_up_step-1][1] = profiled[1];
+        }
+        
+        // VV: Record results of last warming up step
+        f[warming_up_step-1] = evaluate_score(objectives, nullptr);
+        cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], 
+                        objectives, true);
+    } 
+
+    if ( warming_up_step == NMD_NUM_KNOBS + 1) {
+        // VV: We need not explore the knob_set space anymore
+        state_ = start;
+        return step(objectives, knob1, knob2);
+    } else if (warming_up_step > NMD_NUM_KNOBS + 1) {
+        std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl;
+        warming_up_step = 0; // restart the warmup instead of indexing past the simplex arrays below
+    }
+    optstepresult res;
+    res.objectives[0] = -1; // no measurement yet for the configuration returned below
+    res.objectives[1] = -1;
+    res.objectives[2] = -1;
+    res.converged = false;
+
+    res.threads = initial_configurations[warming_up_step][0]; // next configuration to profile
+    res.freq_idx = initial_configurations[warming_up_step][1];
+    
+    v[warming_up_step][0] = res.threads;
+    v[warming_up_step][1] = res.freq_idx;
+    warming_up_step++;
+
+    return res;
 }
 
-void NelderMead::updateObjectives(){
-  /* re-evaluate all the vertices */
-	/*for (j=0;j<=n;j++) {
-	  f[j] = objfunc(v[j]);
-	}
-  */
+optstepresult NelderMead::step(const double objectives[], 
+            double knob1, double knob2)
+{ // Entry point of the state machine: feeds the latest measurement to the handler of the current state and returns the next knobs to try
+    int i, j;
+
+    optstepresult res;
+    res.threads = 0;
+    res.freq_idx = -1;
+    times_used_cached = 0;
+
+    OUT_DEBUG(
+        auto score = evaluate_score(objectives, nullptr);
+
+        std::cout << "[NelderMead|DEBUG] Starting step with "
+            << objectives[0] << " " 
+            << objectives[1] << " " 
+            << objectives[2] << " score " << score << std::endl;
+    )
+    
+    if ( should_update_constraints ) { // apply constraints that were queued while a step was in flight
+        for (i=0; i<NMD_NUM_KNOBS; ++i )
+        {
+           constraint_min[i] = next_constraint_min[i];
+           constraint_max[i] = next_constraint_max[i];
+        }
+        should_update_constraints = false;
+    }
+
+    std::size_t tested_combinations = cache_.size(); // sampled before the handlers below touch the cache
+    
+    #if 0
+    evaluate_score(objectives, nullptr);
 
-	/* find the index of the largest value */
-	vg = vg_index();
+    for (i=0; i<NMD_NUM_KNOBS+1; ++i) {
+        auto key = std::make_pair((int)v[i][0], (int)v[i][1]);
+        auto entry = cache_.find(key);
 
-	/* find the index of the smallest value */
-	vs = vs_index();
+        if ( entry != cache_.end() ) {
+            f[i] = evaluate_score(entry->second.objectives, nullptr);
+        }
+    }
+    #endif
+
+    if ( should_invalidate_cache )
+        do_invalidate_cache();
+    
+    if ( should_reevaluate_scores )
+        do_reevaluate_scores();
+    
+    switch (state_)
+    {
+    case warmup:
+    {
+        res = do_step_warmup(objectives, knob1, knob2);
+        break;
+    }
+    break;
+    case start:
+        itr++;
+        res = do_step_start();
+        break;
+    case reflection:
+        res = do_step_reflect(objectives, knob1, knob2);
+        break;
+    case expansion:
+        res = do_step_expand(objectives, knob1, knob2);
+        break;
+    case contraction_in:
+        res = do_step_contract_in(objectives, knob1, knob2);
+        break;
+    case contraction_out:
+        res = do_step_contract_out(objectives, knob1, knob2);
+        break;
+    default:
+        std::cout << "Unknown NelderMead state (" << state_ << ")" << std::endl;
+        res.converged = false;
+        return res;
+    }
 
-	/* find the index of the second largest value */
-	vh = vh_index();
+    if ( state_ != warmup ) // convergence is only tested once the simplex has been fully profiled
+    {
+        res.converged = testConvergence(tested_combinations);
+
+        if (res.converged == true)
+        {
+            res.threads = v[vs][0]; // report the best vertex
+            res.freq_idx = v[vs][1];
+            OUT_DEBUG(
+                std::cout << "[NelderMead|DEBUG] Converged to " << res.threads << " " << res.freq_idx << std::endl;
+            )
+        }
+    }
+    
+    if ( res.threads > constraint_max[0]) // clamp the returned knobs to the active constraints
+        res.threads = (int) constraint_max[0];
+    else if ( res.threads < constraint_min[0])
+        res.threads = (int) constraint_min[0];
+
+    if ( res.freq_idx > constraint_max[1])
+        res.freq_idx = (int) constraint_max[1];
+    else if ( res.freq_idx < constraint_min[1])
+        res.freq_idx = (int) constraint_min[1];
+    OUT_DEBUG(
+        std::cout << "Stop step with "
+                    << objectives[0] << " " 
+                    << objectives[1] << " " 
+                    << objectives[2] << std::endl;
+    )
+    return res;
+}
 
-  my_constraints(v[vg]);
+bool NelderMead::testConvergence(std::size_t tested_combinations)
+{ // Returns true once the search has converged AND a confirmation pass over the best cached points has been completed
+    double temp;
+    #if 0
+    int all_same = 1;
+
+    for (auto i = 0; i <= n; ++i)
+    {
+        for (auto k = i + 1; j <= n; ++k)
+            for (auto j = 0; j < n; ++j)
+                all_same &= (v[i][j] == v[k][j]);
+    }
 
-	//f[vg] = objfunc(v[vg]);
+    if (all_same)
+    {
+        min = f[vs];
+        return true;
+    }
+    #endif
+    bool ret = false;
 
-	my_constraints(v[vh]);
+    fsum = 0.0;
+    for (auto j = 0; j <= n; j++)
+    {
+        fsum += f[j];
+    }
+    favg = fsum / (n + 1); // mean score over the n+1 vertices
+    s = 0.0;
+    for (auto j = 0; j <= n; j++)
+    {
+        temp = (f[j] - favg);
+        s += temp * temp / (n);
+    }
+    s = sqrt(s);
+    s = s / favg; // normalization step
+#ifdef NMD_INFO_
+    std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl;
+    std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl;
+#endif
+    int max_combinations = (constraint_max[0] - constraint_min[0]+1) 
+                            * (constraint_max[1] - constraint_min[1]+1);
+
+    if ( (s >= EPSILON)
+        && (itr <= MAXITERATIONS)
+        && (max_combinations != tested_combinations) ) // keep searching only while none of the three stop criteria is met
+        ret = false;
+    else
+    {
+        sort_vertices();
+        min = f[vs];
+
+        OUT_DEBUG(
+            std::cout << "[NelderMead|Debug] Cache_ Max: " << max_combinations 
+                        << " explored " << tested_combinations << std::endl;
+            for (const auto &entry: cache_ ) {
+                std::cout << "[NelderMead|Debug] Cache_ " 
+                    << entry.second.threads << " " 
+                    << entry.second.freq_idx << " :: "
+                    << entry.second.objectives[0] << " "
+                    << entry.second.objectives[1] << " "
+                    << entry.second.objectives[2] << " :: "
+                    << evaluate_score(entry.second.objectives, nullptr) << std::endl;
+            }
+        )
+
+        ret = true;
+    }
 
-  //f[vh] = objfunc(v[vh]);
-}
+    if ( ret == true && convergence_reevaluating == true ) { // second convergence: the confirmation pass is done
+        // VV: Now find the best result from cache
+        sort_vertices();
 
-}
-}
-/*
+        double best_knobs[NMD_NUM_KNOBS] = { v[vs][0], v[vs][1]};
+        double best_score = f[vs];
 
-std::vector<double> NelderMead::minimum(){
+        for ( const auto & entry: cache_ ) {
+            auto cur_score = evaluate_score(entry.second.objectives, nullptr);
+            if ( cur_score < best_score) {
+                best_knobs[0] = entry.second.threads;
+                best_knobs[1] = entry.second.freq_idx;
 
+                best_score = cur_score;
+            }
+        }
 
-  free(f);
-  free(vr);
-  free(ve);
-  free(vc);
-  free(vm);
-  for (i=0;i<=n;i++) {
-    free (v[i]);
-  }
-  free(v);
-  return min;
+        v[vs][0] = best_knobs[0];
+        v[vs][1] = best_knobs[1];
+        f[vs] = best_score;
+        return true;
+    } else if ( ret == true ) { // first convergence: re-profile the best cached points before trusting the result
+        // VV: Do another final run to make sure that the objective scores still hold up
+        OUT_DEBUG (
+            std::cout << "[NelderMead|Debug] Doing another final search" << std::endl;
+        )
+        state_ = warmup;
+        warming_up_step = 0;
+        itr --;
+        convergence_reevaluating = true;
+        std::vector<optstepresult> fresh;
+
+        for ( const auto &entry: cache_ ) {
+            fresh.push_back(entry.second);
+        }
 
+        cache_.clear();
 
-}
-*/
+        std::sort(fresh.begin(), fresh.end(), 
+            [this](const optstepresult &l, const optstepresult &r) mutable -> int {
+                return evaluate_score(l.objectives, nullptr) < 
+                        evaluate_score(r.objectives, nullptr);
+            });
 
+        for (auto i=0ul; i<NMD_NUM_KNOBS+1 && i<fresh.size(); ++i ) { // the cache may hold fewer entries than the simplex has vertices
+            v[i][0] = fresh[i].threads;
+            v[i][1] = fresh[i].freq_idx;
+        }
 
+        vs = 0; // `fresh` is sorted by ascending score, so slot 0 is the best vertex
+        vh = 1;
+        vg = 2;
 
+        for (auto i=0; i<NMD_NUM_KNOBS+1; ++i ) {
+            for (auto j=0; j<NMD_NUM_KNOBS; ++j) {
+                initial_configurations[i][j] = v[i][j]; // the warmup restarts from these points
+            }
+        }
+        centroid();
+        
+        OUT_DEBUG (
+            print_initial_simplex();
+        )
+
+        return false;
+    } else {
+        return false;
+    }
+}
 
+} // namespace components
+} // namespace allscale
diff --git a/src/components/scheduler_component.cpp b/src/components/scheduler_component.cpp
index 7185b23..1a5ae8e 100644
--- a/src/components/scheduler_component.cpp
+++ b/src/components/scheduler_component.cpp
@@ -22,12 +22,12 @@
 
 //#define DEBUG_ 1
 //#define DEBUG_INIT_ 1 // define to generate output during scheduler initialization
-//#define DEBUG_MULTIOBJECTIVE_ 1
+// #define DEBUG_MULTIOBJECTIVE_ 1
 //#define DEBUG_THREADTHROTTLING_ 1
 //#define DEBUG_THREADSTATUS_ 1
 //#define DEBUG_FREQSCALING_ 1
 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test
-#define MEASURE_ 1
+// #define MEASURE_ 1
 // only meant to be defined if one needs to measure the efficacy
 // of the scheduler
 #undef DEBUG_
@@ -50,24 +50,16 @@ scheduler::scheduler(std::uint64_t rank)
       current_power_usage(0),
       last_power_usage(0),
       power_sum(0),
-      power_count(0)
-
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-      ,
-      target_freq_found(false)
-#endif
-      ,
-      resource_step(1),
-      target_resource_found(false),
-      sampling_interval(10),
+      power_count(0),
+      sampling_interval(3),
       current_avg_iter_time(0.0),
       multi_objectives(false),
       time_requested(false),
       resource_requested(false),
       energy_requested(false),
-      time_leeway(1.0),
-      resource_leeway(1.0),
-      energy_leeway(1.0),
+      time_weight(0.0),
+      resource_weight(0.0),
+      energy_weight(0.0),
       period_for_time(10),
       period_for_resource(10),
       period_for_power(20),
@@ -87,6 +79,7 @@ scheduler::scheduler(std::uint64_t rank)
 #endif
       ,
       nr_opt_steps(0),
+      last_objective_score(-1.0),
       uselopt(false)
   {
   allscale_monitor = &allscale::monitor::get();
@@ -101,10 +94,11 @@ scheduler::scheduler(std::uint64_t rank)
 #ifdef DEBUG_KOSTAS
   std::cout << "DEBUG_KOSTAS is defined" << std::endl << std::flush;
 #endif
-#ifdef ALLSCALE_HAVE_CPUFREQ_
-  std::cout << "ALLSCALE_HAVE_CPUFREQ_ is defined" << std::endl << std::flush;
+#ifdef ALLSCALE_HAVE_CPUFREQ
+  std::cout << "ALLSCALE_HAVE_CPUFREQ is defined" << std::endl << std::flush;
+#else
+  std::cout << "ALLSCALE_HAVE_CPUFREQ is not defined. No real power measurements or CPU frequency scaling" << std::endl << std::flush;
 #endif
-
 }
 
 /**
@@ -193,22 +187,17 @@ std::size_t scheduler::get_num_numa_cores(std::size_t domain) {
  *
 */
 void scheduler::init() {
-
-  std::vector<objectiveType> objectives_priorities;
-  int objectives_priority_idx=0;
-
   std::size_t num_localities = allscale::get_num_localities();
 
   std::unique_lock<mutex_type> l(resize_mtx_);
   hpx::util::ignore_while_checking<std::unique_lock<mutex_type>> il(&l);
+
   if (initialized_)
     return;
 
 #ifdef MEASURE_
-  update_active_osthreads(0);
-#ifdef ALLSCALE_HAVE_CPUFREQ
-  update_power_consumption(hardware_reconf::read_system_power());
-#endif
+  last_measure_power = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+  last_measure_threads = last_measure_power;
 #endif
 
   rp_ = &hpx::resource::get_partitioner();
@@ -223,8 +212,6 @@ void scheduler::init() {
         )
     );
 
-//   std::cout << "init: " << num_cores << " " << allscale::get_num_localities() << " " << depth_cut_off_ << '\n';
-
   // Reading user provided options in terms of desired optimization objectives
   std::string input_objective_str =
       hpx::get_config_entry("allscale.objective", "");
@@ -232,17 +219,29 @@ void scheduler::init() {
   /* Read optimization policy selected by the user. If not specified,
      allscale policy is the default */
     std::string input_optpolicy_str =
-      hpx::get_config_entry("allscale.policy", "allscale");
+      hpx::get_config_entry("allscale.policy", "none");
+      if ( input_optpolicy_str == "none" ){
+        char *c_optpolicy = std::getenv("ALLSCALE_LOCAL_OPTIMIZER");
+        if ( c_optpolicy) 
+          input_optpolicy_str = std::string(c_optpolicy);
+      }
+
+
+    uselopt=false;
 #ifdef DEBUG_MULTIOBJECTIVE_
     std::cout << "[Local Optimizer|INFO] Optimization Policy Active = " << input_optpolicy_str << std::endl;
 #endif
-    if (input_optpolicy_str=="allscale")
-      lopt_.setPolicy(allscale);
-    else if (input_optpolicy_str=="random")
+  if (input_optpolicy_str=="allscale")
+		lopt_.setPolicy(allscale);
+  else 	if (input_optpolicy_str=="random")
       lopt_.setPolicy(random);
-    else if (input_optpolicy_str=="manual")
+  else if (input_optpolicy_str=="manual")
       lopt_.setPolicy(manual);
-    else lopt_.setPolicy(allscale);
+ 	else if ( input_optpolicy_str != "none" ) {
+		HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", 
+							"unknown allscale.policy");
+	}
+
 
 #ifdef MEASURE_MANUAL_
   std::string input_osthreads_str =
@@ -265,6 +264,12 @@ void scheduler::init() {
   }
 #endif
 
+  if (input_objective_str.empty() ){
+    char *c_opt_objective = std::getenv("ALLSCALE_LOCAL_OBJECTIVE");
+    if ( c_opt_objective )
+      input_objective_str = std::string(c_opt_objective);
+  }
+
   if (!input_objective_str.empty()) {
     uselopt=true;
     std::istringstream iss_leeways(input_objective_str);
@@ -276,95 +281,54 @@ void scheduler::init() {
 #ifdef DEBUG_INIT_
       std::cout << "Scheduling Objective provided: " << obj << "\n";
 #endif
-      // Don't scale objectives if none is given
-      double leeway = 1.0;
+      // VV: Don't scale objectives if none is given
+      double opt_weight = 1.0;
 
       if (idx != std::string::npos) {
 #ifdef DEBUG_INIT_
-        std::cout << "Found a leeway, triggering multi-objectives policies\n"
-                  << std::flush;
+        std::cout << "Found an optimization weight, triggering " 
+                     "multi-objectives policies\n" << std::flush;
 #endif
 
         multi_objectives = true;
         obj = objective_str.substr(0, idx);
-        leeway = std::stod(objective_str.substr(idx + 1));
+        opt_weight = std::stod(objective_str.substr(idx + 1));
       }
 
       if (obj == "time") {
           time_requested = true;
-          objectives_priorities.push_back(time);
-#ifdef DEBUG_INIT_
-          std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx]
-          << std::endl;
-#endif
-          time_leeway = leeway;
+          time_weight = opt_weight;
 #ifdef DEBUG_INIT_
-          std::cout << "Set time margin to " << time_leeway << "\n" << std::flush;
+          std::cout << "Set time weight to " << time_weight << "\n" << std::flush;
 #endif
-
       } else if (obj == "resource") {
-          resource_requested = true;
-          objectives_priorities.push_back(resource);
-#ifdef DEBUG_INIT_
-          std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx]
-          << std::endl;
-#endif
-        resource_leeway = leeway;
+        resource_requested = true;
+        resource_weight = opt_weight;
 #ifdef DEBUG_INIT_
-        std::cout << "Set resource margin to " << resource_leeway << "\n"
+        std::cout << "Set resource weight to " << resource_weight << "\n"
                   << std::flush;
-        ;
 #endif
 
       } else if (obj == "energy") {
-          energy_requested = true;
-          objectives_priorities.push_back(energy);
+        energy_requested = true;
+        energy_weight = opt_weight;
 #ifdef DEBUG_INIT_
-          std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx]
-          << std::endl;
-#endif
-        energy_leeway = leeway;
-#ifdef DEBUG_INIT_
-        std::cout << "Set energy margin to " << energy_leeway << "\n"
+        std::cout << "Set energy weight to " << energy_weight << "\n"
                   << std::flush;
-        ;
 #endif
       } else {
-        std::ostringstream all_keys;
-        copy(scheduler::objectives.begin(), scheduler::objectives.end(),
-             std::ostream_iterator<std::string>(all_keys, ","));
-        std::string keys_str = all_keys.str();
-        keys_str.pop_back();
+        std::cout << "TRIED PARSING \"" << obj << "\"" << std::endl;
         HPX_THROW_EXCEPTION(
             hpx::bad_request, "scheduler::init",
             boost::str(
-                boost::format("Wrong objective: %s, Valid values: [%s]") % obj %
-                keys_str));
+                boost::format("Wrong objective: Valid values: [time, energy, resource]")));
       }
 
-      if (time_leeway > 1 || resource_leeway > 1 || energy_leeway > 1) {
+      if (time_weight > 2 || resource_weight > 2 || energy_weight > 2
+          || time_weight < -2 || resource_weight < -2 || energy_weight < -2) {
         HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
-                            "leeways should be within ]0, 1]");
+                            "Objective weights should be within [-2, 2]");
       }
-      objectives_priority_idx++;
-    }
-  }
-  objectives_priority_idx--;
-
-  /* Reading optional user provided input for granularity (step) of
-     adding/removing resources to/from the runtime (where resource=OS thread) */
-  std::string input_resource_step_str =
-      hpx::get_config_entry("allscale.resource_step", "");
-  if (!input_resource_step_str.empty()) {
-
-    resource_step = std::stoul(input_resource_step_str);
-#ifdef DEBUG_INIT_
-    std::cout << "Resource step provided : " << resource_step << "\n";
-#endif
-    if (resource_step == 0 || resource_step >= os_thread_count) {
-      HPX_THROW_EXCEPTION(
-          hpx::bad_request, "scheduler::init",
-          "resource step should be within ]0, total nb threads[");
     }
   }
 
@@ -393,18 +357,14 @@ void scheduler::init() {
     executors_.emplace_back(pool_name);
   }
 
-#if defined(ALLSCALE_HAVE_CPUFREQ)
   if (multi_objectives) {
-    // reallocating objectives_status vector of vectors
-    objectives_status.resize(3);
-    for (int i = 0; i < 3; i++) {
-      objectives_status[i].resize(3);
-    }
-#ifdef DEBUG_INIT_
+
+    #ifdef DEBUG_INIT_
     std::cout << "\n****************************************************\n" << std::flush;
-    std::cout << "Policy selected: multi-objective set with time=" << time_leeway
-              << ", resource=" << resource_leeway
-              << ", energy=" << energy_leeway << "\n"
+    std::cout << "Policy selected: multi-objective set with time=" << time_weight
+              << ", energy=" << energy_weight 
+              << ", resource=" << resource_weight
+              << "\n"
               << std::flush;
     std::cout << "Objectives Flags Set: \n" <<
               "\tTime: " << time_requested <<
@@ -413,18 +373,16 @@ void scheduler::init() {
               "\tMulti-objective: " << multi_objectives <<
               "\n" << std::flush;
     std::cout << "****************************************************\n" << std::flush;
-#endif
+    #endif
   }
 
   if (energy_requested)
     initialize_cpu_frequencies();
 
-#ifdef MEASURE_MANUAL_
+  #ifdef MEASURE_MANUAL_
   if (manual_input_provided && input_objective_str.empty())
       fix_allcores_frequencies(temp_idx);
-#endif
-
-#endif
+  #endif
 
   initialized_ = true;
 #ifdef DEBUG_INIT_
@@ -442,64 +400,31 @@ void scheduler::init() {
     last_optimization_timestamp_ = t_duration_now;
     last_objective_measurement_timestamp_= t_duration_now;
 
-    std::list<objective> objectives_temp;
-    if (energy_requested){
-      objective o_temp;
-      o_temp.type=energy;
-      o_temp.leeway=energy_leeway;
-      int i=0;
-      for(auto& el: objectives_priorities){
-        if (el==energy){
-          o_temp.priority=i;
-          break;
-        }
-        ++i;
-      }
-      objectives_temp.push_back(o_temp);
-    }
-    if (time_requested){
-      objective o_temp;
-      o_temp.type=time;
-      o_temp.leeway=time_leeway;
-      int i=0;
-      for(auto& el: objectives_priorities){
-        if (el==time){
-          o_temp.priority=i;
-          break;
-        }
-        ++i;
-      }
-      objectives_temp.push_back(o_temp);
-    }
-    if (resource_requested){
-      objective o_temp;
-      o_temp.type=resource;
-      o_temp.leeway=resource_leeway;
-      int i=0;
-      for(auto& el: objectives_priorities){
-        if (el==resource){
-          o_temp.priority=i;
-          break;
-        }
-        ++i;
-      }
-      objectives_temp.push_back(o_temp);
-    }
-    lopt_.setobjectives(objectives_temp);
     lopt_.setmaxthreads(os_thread_count);
-    lopt_.reset(os_thread_count,0);
-  #if defined(ALLSCALE_HAVE_CPUFREQ)
+
+    #if defined(ALLSCALE_HAVE_CPUFREQ)
     using hardware_reconf = allscale::components::util::hardware_reconf;
-    std::vector<unsigned long> freq_temp =
-      lopt_.setfrequencies(hardware_reconf::get_frequencies(0));
+    auto  freqs = hardware_reconf::get_frequencies(0);
+
+    auto freq_temp = lopt_.setfrequencies(freqs);
     if (freq_temp.empty()){
       HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
       "error in initializing the local optimizer, allowed frequency values are empty");
     }
-  #endif
-#ifdef DEBUG_
+    // VV: Set to max number of threads and max frequency
+    lopt_.reset(os_thread_count, freqs.size()-1);
+    #else
+    // VV: Max number of threads, and an arbitrary frequency index
+    lopt_.reset(os_thread_count,0);
+    auto freq_temp = lopt_.setfrequencies({0});
+    #endif
+    
+    // VV: Set objectives after setting all constraints to
+    //     trigger the initialization of nmd
+    lopt_.setobjectives(time_weight, energy_weight, resource_weight);
+    #ifdef DEBUG_
     lopt_.printobjectives();
-#endif
+    #endif
   }
 }
 
@@ -512,16 +437,13 @@ void scheduler::init() {
  * potential.
  *
 */
-void scheduler::initialize_cpu_frequencies() {
 #if defined(ALLSCALE_HAVE_CPUFREQ)
+void scheduler::initialize_cpu_frequencies() 
+{
   using hardware_reconf = allscale::components::util::hardware_reconf;
   cpu_freqs = hardware_reconf::get_frequencies(0);
-  freq_step = 8; // cpu_freqs.size() / 2;
-  freq_times.resize(cpu_freqs.size());
-
-#ifdef MEASURE_
-#ifdef ALLSCALE_HAVE_CPUFREQ
-#ifdef DEBUG_INIT_
+  
+  #if defined(MEASURE_) && defined(DEBUG_INIT_)
   unsigned long temp_transition_latency=hardware_reconf::get_cpu_transition_latency(1);
   if (temp_transition_latency==0)
     std::cout << "[INFO] Transition Latency Unavailable" <<
@@ -530,45 +452,37 @@ void scheduler::initialize_cpu_frequencies() {
     std::cout << "[INFO] Core-1 Frequency Transition Latency = " <<
       hardware_reconf::get_cpu_transition_latency(2)/1000 <<
       " milliseconds\n" << std::flush;
-#endif
-#endif
-#endif
 
-#ifdef DEBUG_INIT_
+  #endif
+
+  #ifdef DEBUG_INIT_
   std::cout << "[INFO] Governors available on the system: " <<
       "\n" << std::flush;
-#ifdef ALLSCALE_HAVE_CPUFREQ
   std::vector<std::string> temp_governors = hardware_reconf::get_governors(0);
   for (std::vector<std::string>::const_iterator i = temp_governors.begin(); i != temp_governors.end(); ++i)
     std::cout << "[INFO]\t" << *i << "\n" << std::flush;
-#endif
   std::cout << "\n" << std::flush;
-#endif
 
-#ifdef DEBUG_INIT_
   std::cout << "Server Processor Available Frequencies (size = " << cpu_freqs.size() << ")";
   for (auto &ind : cpu_freqs) {
     std::cout << ind << " ";
   }
   std::cout << "\n" << std::flush;
-#endif
+  #endif
 
   auto min_max_freqs = std::minmax_element(cpu_freqs.begin(), cpu_freqs.end());
   min_freq = *min_max_freqs.first;
   max_freq = *min_max_freqs.second;
-
-#ifdef DEBUG_INIT_
-  std::cout << "Min freq:  " << min_freq << ", Max freq: " << max_freq << "\n"
-            << std::flush;
-#endif
   // TODO: verify that nbpus == all pus of the system, not just the online
   // ones
   size_t nbpus = topo_->get_number_of_pus();
-#ifdef DEBUG_INIT_
+
+  #ifdef DEBUG_INIT_
+  std::cout << "Min freq:  " << min_freq << ", Max freq: " << max_freq << "\n"
+            << std::flush;
   std::cout << "nbpus known to topo_:  " << nbpus << "\n" << std::flush;
-#endif
+  #endif
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
   hardware_reconf::make_cpus_online(0, nbpus);
   hardware_reconf::topo_init();
   // We have to set CPU governors to userpace in order to change frequencies
@@ -579,13 +493,12 @@ void scheduler::initialize_cpu_frequencies() {
 
   topo = hardware_reconf::read_hw_topology();
   // first reinitialize to a normal setup
-  for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id++) {
+  for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id++){
     hardware_reconf::set_freq_policy(cpu_id, policy);
-#ifdef DEBUG_INIT_
-    std::cout << "cpu_id " << cpu_id << " back to on-demand. ret=  " << res
-              << "\n"
-              << std::flush;
-#endif
+    #ifdef DEBUG_INIT_
+    std::cout << "cpu_id " << cpu_id << " back to on-demand. ret=  " 
+              << res << std::endl;
+    #endif
   }
 
   governor = "userspace";
@@ -593,8 +506,10 @@ void scheduler::initialize_cpu_frequencies() {
   policy.min = min_freq;
   policy.max = max_freq;
 
-  for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores;
-       cpu_id += topo.num_hw_threads) {
+  for (unsigned int cpu_id = 0; 
+       cpu_id < topo.num_logical_cores;
+       cpu_id += topo.num_hw_threads) 
+  {
     int res = hardware_reconf::set_freq_policy(cpu_id, policy);
     if (res) {
       HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init",
@@ -603,34 +518,29 @@ void scheduler::initialize_cpu_frequencies() {
 
       return;
     }
-#ifdef DEBUG_INIT_
+  #ifdef DEBUG_INIT_
     std::cout << "cpu_id " << cpu_id
               << " initial freq policy setting. ret=  " << res << "\n"
               << std::flush;
-#endif
+  #endif
   }
-#endif
-
   // Set frequency of all threads to max when we start
 
-  {
-    // set freq to all PUs used by allscale
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
+  // set freq to all PUs used by allscale
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        if (!cpufreq_cpu_exists(pu_num)) {
-          hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[0]);
-#ifdef DEBUG_INIT_
-          std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[0]
-                    << ", (ret= " << res << ")\n"
-                    << std::flush;
-#endif
-        }
-#endif
+
+      if (!cpufreq_cpu_exists(pu_num)) {
+        hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[0]);
+        #ifdef DEBUG_INIT_
+        std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[0]
+                  << ", (ret= " << res << ")\n"
+                  << std::flush;
+        #endif
       }
     }
   }
@@ -639,37 +549,33 @@ void scheduler::initialize_cpu_frequencies() {
 
   // Make sure frequency change happened before continuing
   std::cout << "topo.num_logical_cores: " << topo.num_logical_cores
-            << "topo.num_hw_threads" << topo.num_hw_threads << "\n"
+            << " topo.num_hw_threads" << topo.num_hw_threads << "\n"
             << std::flush;
-  {
-    // check status of Pus frequency
-#ifdef ALLSCALE_HAVE_CPUFREQ
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      unsigned long hardware_freq = 0;
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
-
-        if (!cpufreq_cpu_exists(pu_num)) {
-          do {
-            hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
-#ifdef DEBUG_INIT_
-            std::cout << "current freq on cpu " << pu_num << " is "
-                      << hardware_freq << " (target freq is " << cpu_freqs[0]
-                      << " )\n"
-                      << std::flush;
-
-#endif
+      // check the frequency status of the PUs
+    
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    unsigned long hardware_freq = 0;
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
+
+      if (!cpufreq_cpu_exists(pu_num)) {
+        do {
+          hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
+        #ifdef DEBUG_INIT_
+          std::cout << "current freq on cpu " << pu_num << " is "
+                    << hardware_freq << " (target freq is " << cpu_freqs[0]
+                    << " )\n"
+                    << std::flush;
+        #endif
 
-          } while (hardware_freq != cpu_freqs[0]);
-        }
+        } while (hardware_freq != cpu_freqs[0]);
       }
     }
-#endif
   }
 
-#ifdef ALLSCALE_USE_CORE_OFFLINING
+  #ifdef ALLSCALE_USE_CORE_OFFLINING
   // offline unused cpus
   for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores;
        cpu_id += topo.num_hw_threads) {
@@ -682,25 +588,23 @@ void scheduler::initialize_cpu_frequencies() {
     }
 
     if (!found_it) {
-#ifdef DEBUG_INIT_
+      #ifdef DEBUG_INIT_
       std::cout << " setting cpu_id " << cpu_id << " offline \n" << std::flush;
-#endif
+      #endif
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
       hardware_reconf::make_cpus_offline(cpu_id, cpu_id + topo.num_hw_threads);
-#endif
     }
   }
-#endif
-
+  #endif
+}
 #else
-  // should we really abort or should we reset energy to 1 ?
-  HPX_THROW_EXCEPTION(
-      hpx::bad_request, "scheduler::init",
-      "Requesting energy objective without having compiled with cpufreq");
-#endif
+void scheduler::initialize_cpu_frequencies() 
+{
+    cpu_freqs.clear();
+    // VV: Placeholder frequency so cpu_freqs is non-empty without cpufreq support
+    cpu_freqs.push_back(1000*1024);
 }
-
+#endif
 
 /**
  *
@@ -717,9 +621,7 @@ void scheduler::optimize_locally(work_item const& work)
         // find out which pool has the most threads
 
         /* Count Active threads for validation*/
-
         hpx::threads::mask_type active_mask;
-        std::size_t active_threads_ = 0;
         std::size_t domain_active_threads = 0;
         std::size_t pool_idx = 0;
         int total_threads_counted=0;
@@ -736,21 +638,13 @@ void scheduler::optimize_locally(work_item const& work)
             }
         }
         std::cout << "Active OS Threads = " <<  total_threads_counted << std::endl;
-#endif
 
-#ifdef MEASURE_
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        std::size_t temp_id = work.id().id;
-        if ((temp_id >= period_for_power) &&
-                (temp_id % period_for_power == 0))
-            update_power_consumption(hardware_reconf::read_system_power());
-#endif
 #endif
 
-#ifdef ALLSCALE_HAVE_CPUFREQ
-        if (uselopt && !lopt_.isConverged()){
+        if (uselopt && !lopt_.isConverged()) {
             last_power_usage++;
-            current_power_usage = hardware_reconf::read_system_power();
+            allscale::components::monitor *monitor_c = &allscale::monitor::get();
+            current_power_usage = monitor_c->get_current_power();
             power_sum += current_power_usage;
 
             auto t_now = std::chrono::system_clock::now();
@@ -760,6 +654,10 @@ void scheduler::optimize_locally(work_item const& work)
 
             long elapsedTimeMs = t_duration_now - last_objective_measurement_timestamp_;
 
+            auto dt_power = t_duration_now - last_measure_power;
+            last_measure_power = t_duration_now;
+            update_power_consumption(power_sum/last_power_usage, dt_power);
+
             if (elapsedTimeMs > objective_measurement_period_ms){
                 last_objective_measurement_timestamp_= t_duration_now;
 
@@ -773,60 +671,90 @@ void scheduler::optimize_locally(work_item const& work)
 #endif
                     current_avg_iter_time = 0.0;
                 }
-
-                lopt_.measureObjective(current_avg_iter_time,power_sum/last_power_usage,
+                double last_objectives[] = {current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()),
+                        active_threads};
+                lopt_.measureObjective(current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()),
                         active_threads);
-                last_power_usage=0;
-                power_sum=0;
+
+                last_objective_score = lopt_.evaluate_score(last_objectives);
+
+                auto power_dt = t_duration_now - last_measure_power;
+                update_power_consumption(power_sum/last_power_usage, power_dt);
+                last_measure_power = t_duration_now;
+
+                // VV: instead of starting from scratch, remember the last power measurement
+                last_power_usage=1;
+                power_sum=current_power_usage;
             }
 
             elapsedTimeMs = t_duration_now - last_optimization_timestamp_;
 
-            if (elapsedTimeMs > optimization_period_ms){
+            if (elapsedTimeMs > optimization_period_ms || nr_opt_steps == 0){
                 last_optimization_timestamp_= t_duration_now;
                 nr_opt_steps++;
-                actuation act_temp = lopt_.step();
+                actuation act_temp = lopt_.step(active_threads);
 #ifdef DEBUG_MULTIOBJECTIVE_
                 lopt_.printverbosesteps(act_temp);
 #endif
-                // amend threads if signaled
-                /*
-                if (act_temp.delta_threads<0){
-                    unsigned int suspended_temp =
-                        suspend_threads(-1 * act_temp.delta_threads);
-                    lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp);
+                auto dt_threads = t_duration_now - last_measure_threads;
+                update_active_osthreads(active_threads, dt_threads);
+                last_measure_threads = t_duration_now;
+                if (act_temp.threads < active_threads){
+                    suspend_threads(active_threads-act_temp.threads);
                 }
-                else if (act_temp.delta_threads>0){
-                    unsigned int resumed_temp =
-                        resume_threads(act_temp.delta_threads);
-                    lopt_.setCurrentThreads(lopt_.getCurrentThreads()+resumed_temp);
+                else if (act_temp.threads > active_threads){
+                    resume_threads(act_temp.threads - active_threads);
                 }
-                */
-
-                if (act_temp.delta_threads < active_threads){
-#ifdef DEBUG_MULTIOBJECTIVE_
-                    int new_threads_target = (int)active_threads - act_temp.delta_threads;
-                    std::cout << "[SCHEDULER|INFO]: Optimizer induced threads to suspend: " << new_threads_target << std::endl;
-                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << ", target threads = " << act_temp.delta_threads << std::endl;
-#endif
-                    //unsigned int suspended_temp = suspend_threads(new_threads_target);
-                    //lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp);
+                fix_allcores_frequencies(act_temp.frequency_idx);
+                lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx);
+                lopt_.setCurrentThreads(active_threads);
 
-                    lopt_.setCurrentThreads(active_threads);
-                }
-                else if (act_temp.delta_threads > active_threads){
 #ifdef DEBUG_MULTIOBJECTIVE_
-                    int new_threads_target = act_temp.delta_threads - (int)active_threads;
-                    std::cout << "[SCHEDULER|INFO]: Optimizer induced threads to resume to: " << new_threads_target << std::endl;
-                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << ", target threads = " << act_temp.delta_threads << std::endl;
+                    std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() 
+                    << " , target threads = " << act_temp.threads << std::endl;
 #endif
-                    fix_allcores_frequencies(act_temp.frequency_idx);
-                    lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx);
-                }
             }
-        } // uselopt
-#endif
-    }
+        } 
+    #ifdef MEASURE_
+        else {
+          auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+          auto dt = timestamp_now - last_measure_power;
+          if ( dt >= 1000 ) {
+            allscale::components::monitor *monitor_c = &allscale::monitor::get();
+            auto cur_power = monitor_c->get_current_power();
+
+            update_power_consumption(cur_power, dt);
+            last_measure_power = timestamp_now;
+          }
+        }
+    #endif
+  }
+}
+
+
+void scheduler::update_max_threads(std::size_t max_threads)
+{
+  std::cout << "Will try to set max threads to " << max_threads <<std::endl;
+  if (uselopt)
+    lopt_.setmaxthreads(max_threads);
+  else if (active_threads > max_threads )
+    suspend_threads(active_threads - max_threads);
+  else if ( active_threads < max_threads )
+    resume_threads(max_threads - active_threads);
+}
+
+void scheduler::set_local_optimizer_weights(double time_weight, 
+                                         double energy_weight,
+                                         double resource_weight)
+{
+    lopt_.setobjectives(time_weight, energy_weight, resource_weight);
+}
+
+void scheduler::get_local_optimizer_weights(double *time_weight,
+                                           double *energy_weight,
+                                           double *resource_weight)
+{
+    lopt_.getobjectives(time_weight, energy_weight, resource_weight);
 }
 
 std::pair<work_item, std::unique_ptr<data_item_manager::task_requirements_base>> scheduler::schedule_local(work_item work,
@@ -1057,10 +985,6 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) {
   std::cout << "total active PUs: " << active_threads_ << "\n";
 #endif
 
-#ifdef MEASURE_
-  update_active_osthreads(active_threads_-active_threads);
-#endif
-
   active_threads = active_threads_;
 
   growing = false;
@@ -1122,9 +1046,6 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) {
             )
         );
   }
-#ifdef MEASURE_
-  update_active_osthreads(-1 * suspend_threads.size());
-#endif
 
   active_threads = active_threads - suspend_threads.size();
 
@@ -1243,10 +1164,6 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) {
   std::cout << "total active PUs: " << active_threads_ << "\n";
 #endif
 
-#ifdef MEASURE_
-  update_active_osthreads(active_threads_-active_threads);
-#endif
-
   active_threads = active_threads_;
   // if no thread is suspended, nothing to do
   if (domain_blocked_threads == 0) {
@@ -1302,9 +1219,6 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) {
             )
         );
   }
-#ifdef MEASURE_
-  update_active_osthreads(resume_threads.size());
-#endif
   active_threads = active_threads + resume_threads.size();
 #ifdef DEBUG_THREADSTATUS_
   std::cout << "[SCHEDULER|INFO]: Thread Resume - Newly Active Threads: " << active_threads
@@ -1334,9 +1248,9 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){
   // ones
 
   size_t nbpus = topo_->get_number_of_pus();
-#ifdef DEBUG_FREQSCALING_
+  #ifdef DEBUG_FREQSCALING_
   std::cout << "nbpus known to topo_:  " << nbpus << "\n" << std::flush;
-#endif
+  #endif
 
   hardware_reconf::make_cpus_online(0, nbpus);
   hardware_reconf::topo_init();
@@ -1357,117 +1271,104 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){
                           "set cpu frequency");
       return;
     }
-#ifdef DEBUG_FREQSCALING_
+  #ifdef DEBUG_FREQSCALING_
     std::cout << "cpu_id " << cpu_id
               << " initial freq policy setting. ret=  " << res << "\n"
               << std::flush;
-#endif
+  #endif
   }
 
-
-  {
-    // set freq of all cores used to min
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
-
-        if (!cpufreq_cpu_exists(pu_num)) {
-          //int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[cpu_freqs[.size()-1]]);
-          int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[frequency_idx]);
-          (void)res;
-#if defined(MEASURE_MANUAL_)
-          fixed_frequency_ = cpu_freqs[frequency_idx];
-#endif
-#ifdef DEBUG_FREQSCALING_
-          //std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[cpu_freqs.size()-1]
-          std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[frequency_idx]
-                    << ", (ret= " << res << ")\n"
-                    << std::flush;
-#endif
-        }
+  // set freq of all cores used to cpu_freqs[frequency_idx]
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
+
+      if (!cpufreq_cpu_exists(pu_num)) {
+        //int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[cpu_freqs[.size()-1]]);
+        int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[frequency_idx]);
+        (void)res;
+        #if defined(MEASURE_MANUAL_)
+        fixed_frequency_ = cpu_freqs[frequency_idx];
+        #endif
+        #ifdef DEBUG_FREQSCALING_
+        //std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[cpu_freqs.size()-1]
+        std::cout << "Setting cpu " << pu_num << " to freq  " << cpu_freqs[frequency_idx]
+                  << ", (ret= " << res << ")\n"
+                  << std::flush;
+        #endif
       }
     }
   }
 
-  {
-    // check status of Pus frequency
-    for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
-      unsigned long hardware_freq = 0;
-      std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
-      for (std::size_t j = 0; j < thread_count; j++) {
-        std::size_t pu_num =
-            rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
 
-        if (!cpufreq_cpu_exists(pu_num)) {
-          do {
-            hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
-#ifdef DEBUG_FREQSCALING_
-            std::cout << "current freq on cpu " << pu_num << " is "
-                      //<< hardware_freq << " (target freq is " << cpu_freqs[cpu_freqs.size()-1]
-                      << hardware_freq << " (target freq is " << cpu_freqs[frequency_idx]
-                      << " )\n"
 
-                      << std::flush;
+  // check the frequency status of the PUs
+  for (std::size_t i = 0; i != thread_pools_.size(); ++i) {
+    unsigned long hardware_freq = 0;
+    std::size_t thread_count = thread_pools_[i]->get_os_thread_count();
+    for (std::size_t j = 0; j < thread_count; j++) {
+      std::size_t pu_num =
+          rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset());
 
-#endif
+      if (!cpufreq_cpu_exists(pu_num)) {
+        do {
+          hardware_freq = hardware_reconf::get_hardware_freq(pu_num);
+          #ifdef DEBUG_FREQSCALING_
+          std::cout << "current freq on cpu " << pu_num << " is "
+                    //<< hardware_freq << " (target freq is " << cpu_freqs[cpu_freqs.size()-1]
+                    << hardware_freq << " (target freq is " << cpu_freqs[frequency_idx]
+                    << " )\n"
 
-          //} while (hardware_freq != cpu_freqs[cpu_freqs.size()-1]);
-          } while (hardware_freq != cpu_freqs[frequency_idx]);
-        }
+                    << std::flush;
+            #endif
+        //} while (hardware_freq != cpu_freqs[cpu_freqs.size()-1]);
+        } while (hardware_freq != cpu_freqs[frequency_idx]);
       }
     }
   }
+  
+}
+#else
+void scheduler::fix_allcores_frequencies(int frequency_idx)
+{
+    // VV: This is a stub
 }
 #endif
 
 #ifdef MEASURE_
-void scheduler::update_active_osthreads(std::size_t delta) {
-  std::size_t temp = active_threads + delta;
-  if (meas_active_threads_max==0)
-    meas_active_threads_max=temp;
+void scheduler::update_active_osthreads(std::size_t threads, int64_t delta_time) {
 
-  if (meas_active_threads_min==0)
-    meas_active_threads_min=temp;
+  if (meas_active_threads_max==0 || meas_active_threads_max < threads)
+    meas_active_threads_max=threads;
 
-  if (meas_active_threads_sum==0){
-    meas_active_threads_count++;
-    meas_active_threads_sum=active_threads;
-    return;
-  }
+  if (meas_active_threads_min==0 || meas_active_threads_min > threads)
+    meas_active_threads_min=threads;
 
-  if ((temp >= min_threads) && (temp <= os_thread_count)){
-    meas_active_threads_count++;
-    meas_active_threads_sum+=temp;
-    if (temp > meas_active_threads_max)
-      meas_active_threads_max=temp;
-    if (temp < meas_active_threads_min)
-      meas_active_threads_min=temp;
-  }
+  meas_active_threads_count += delta_time;
+  meas_active_threads_sum += threads * delta_time;
+
+  std::cout <<"REGISTERING THREADS " << threads << " for " << delta_time << 
+  " current average " << (meas_active_threads_sum/meas_active_threads_count) << std::endl;
 }
 
-void scheduler::update_power_consumption(std::size_t power_sample) {
-  if (meas_power_max==0)
+void scheduler::update_power_consumption(std::size_t power_sample, int64_t delta_time)
+{
+  if ( power_sample > 10000)
+    return;
+  
+  if (meas_power_max==0 || meas_power_max < power_sample)
     meas_power_max=power_sample;
 
-  if (meas_power_min==0)
+  if (meas_power_min==0 || meas_power_min > power_sample)
     meas_power_min=power_sample;
 
-  if (meas_power_sum==0){
-    meas_power_count++;
-    meas_power_sum=power_sample;
-    return;
-  }
 
-  if (power_sample <= 10000){
-    meas_power_count++;
-    meas_power_sum+=power_sample;
-    if (power_sample > meas_power_max)
-      meas_power_max=power_sample;
-    if (power_sample < meas_power_min)
-      meas_power_min=power_sample;
-  }
+  meas_power_count += delta_time;
+  meas_power_sum += power_sample * delta_time;
+
+  std::cout << "Reporting Threads:" << active_threads << " Power:" << power_sample << " for Dt:" << delta_time << std::endl;
 }
 #endif
 
@@ -1494,51 +1395,33 @@ void scheduler::stop() {
       ++pool_idx;
     }
   }
-
-  /*
-
-  if (energy_requested) {
-#if defined(ALLSCALE_HAVE_CPUFREQ)
-
-    for (int cpu_id = 0; cpu_id < topo.num_logical_cores;
-         cpu_id += topo.num_hw_threads) {
-      bool found_it = false;
-      for (std::size_t i = 0; i != thread_pools_.size(); i++) {
-        if (hpx::threads::test(initial_masks_[i], cpu_id))
-          found_it = true;
-      }
-
-      if (!found_it) {
-#ifdef DEBUG_
-        std::cout << " setting cpu_id " << cpu_id << " back online \n"
-                  << std::flush;
-#endif
-
-        hardware_reconf::make_cpus_online(cpu_id, cpu_id + topo.num_hw_threads);
-      }
-    }
-
-    std::string governor = "ondemand";
-    policy.governor = const_cast<char *>(governor.c_str());
-    std::cout << "Set CPU governors back to " << governor << std::endl;
-    for (int cpu_id = 0; cpu_id < topo.num_logical_cores;
-         cpu_id += topo.num_hw_threads)
-      int res = hardware_reconf::set_freq_policy(cpu_id, policy);
-#endif
-  }
-  */
-
   stopped_ = true;
-  //         work_queue_cv_.notify_all();
-  //         std::cout << "rank(" << rank_ << "): scheduled " << count_ << "\n";
-
 
   /* Output all measured metrics */
 #ifdef DEBUG_MULTIOBJECTIVE_
 #ifdef MEASURE_
+  auto timestamp_now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()).time_since_epoch().count();
+  auto dt_threads = timestamp_now - last_measure_threads;
+  auto dt_power = timestamp_now - last_measure_power;
+
+  last_measure_power = timestamp_now;
+  last_measure_threads = timestamp_now;
+
+  update_active_osthreads(active_threads, dt_threads);
+    allscale::components::monitor *monitor_c = &allscale::monitor::get();
+
+  auto measurement = monitor_c->get_current_power();
+  if ( measurement <= 10000 ) {
+    update_power_consumption(measurement, dt_power);
+  }
+  
+  if ( meas_active_threads_count == 0 )
+    meas_active_threads_count = 1;
+  if ( meas_power_count == 0 )
+    meas_power_count = 1;
+  
   std::cout << "\n****************************************************\n" << std::flush;
   std::cout << "Measured Metrics of Application Execution:\n"
-
             << "\tTotal number of tasks scheduled locally (#taskslocal) = "
             << nr_tasks_scheduled << std::endl
 
@@ -1571,5 +1454,6 @@ void scheduler::stop() {
 #endif
 
 }
-}
-}
+
+} // components
+} // allscale
diff --git a/src/components/util/hardware_reconf.cpp b/src/components/util/hardware_reconf.cpp
index 4cf1491..b515977 100644
--- a/src/components/util/hardware_reconf.cpp
+++ b/src/components/util/hardware_reconf.cpp
@@ -5,6 +5,7 @@
 #include <memory>
 #include <mutex>
 #include <cpufreq.h>
+#include <algorithm>    // std::sort
 
 #include <boost/format.hpp>
 
@@ -25,6 +26,7 @@ namespace allscale { namespace components { namespace util {
         if (available_frequencies != nullptr)
             cpufreq_put_available_frequencies(available_frequencies);
 
+        std::sort(frequencies.begin(), frequencies.end());
         return frequencies;
     }
 
diff --git a/src/dashboard.cpp b/src/dashboard.cpp
index ee326f1..8de511f 100644
--- a/src/dashboard.cpp
+++ b/src/dashboard.cpp
@@ -23,6 +23,9 @@
 #include <boost/asio.hpp>
 
 
+// VV: Define this to use time/energy/resources instead of speed/energy/efficiency
+// #define ALTERNATIVE_SCORE 
+
 namespace allscale { namespace dashboard
 {
     node_state get_state()
@@ -54,17 +57,25 @@ namespace allscale { namespace dashboard
         state.max_frequency = monitor_c->get_max_freq(0);
 
         std::size_t active_cores = scheduler::get().get_active_threads();
-
+        state.last_local_score = scheduler::get().get_last_objective_score();
         state.productive_cycles_per_second = float(state.cur_frequency) * (1.f - state.idle_rate);  // freq to Hz
 
+#if defined(ALTERNATIVE_SCORE)
+        state.speed = monitor_c->get_avg_time_last_iterations(100);
+        state.efficiency = active_cores;
+#else
         state.speed = 1.f - state.idle_rate;
         state.efficiency = state.speed * (float(state.cur_frequency * active_cores) / float(state.max_frequency * state.num_cores));
+#endif
 
-#ifdef POWER_ESTIMATE
+#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         state.cur_power = monitor_c->get_current_power();
         state.max_power = monitor_c->get_max_power();
-        state.power = state.cur_power / state.max_power;
+#else
+        state.max_power = 1.0;
+        state.cur_power = 1.0;
 #endif
+        state.power = (state.max_power > 0) ? state.cur_power / state.max_power : 0.f; // guard against a zero max-power reading, as the aggregated state does
 
         return state;
     }
@@ -99,6 +110,7 @@ namespace allscale { namespace dashboard
         ar & speed;
         ar & efficiency;
         ar & power;
+        ar & last_local_score;
     }
 
     std::string node_state::to_json() const
@@ -164,9 +176,15 @@ namespace allscale { namespace dashboard
 
     float system_state::score() const
     {
+#if defined(ALTERNATIVE_SCORE)
+        return std::exp(speed * speed_exponent) *
+                std::exp(efficiency * efficiency_exponent ) *
+                std::exp(power * power_exponent);
+#else
         return std::pow(speed, speed_exponent) *
                std::pow(efficiency, efficiency_exponent) *
                std::pow(1 - power, power_exponent);
+#endif
     }
 
     template void node_state::serialize<hpx::serialization::input_archive>(hpx::serialization::input_archive& ar, unsigned);
@@ -208,7 +226,7 @@ namespace allscale { namespace dashboard
 
             const char* host_env = std::getenv(ENVVAR_DASHBOARD_IP);
             const char* port_env = std::getenv(ENVVAR_DASHBOARD_PORT);
-
+            
             std::string host;
             if (host_env)
             {
@@ -298,11 +316,11 @@ namespace allscale { namespace dashboard
             buffers[0] = boost::asio::buffer(&m->msg_size, sizeof(std::uint64_t));
             buffers[1] = boost::asio::buffer(m->json.data(), m->json.length());
 
-/*
+            /*
              std::cout << "Sending -----------------------------------\n";
              std::cout << m->json << '\n';
              std::cout << "Sending done ------------------------------\n";
-*/
+            */
             boost::asio::async_write(socket_, buffers,
                 [f = std::move(f), m](boost::system::error_code ec, std::size_t /*length*/)
                 {
@@ -431,6 +449,7 @@ namespace allscale { namespace dashboard
         std::vector<hpx::id_type> localities_;
         std::uint64_t time = 0;
         bool enabled_;
+        double use_gopt, use_lopt;
     };
 
     dashboard_client& dashboard_client::get()
@@ -490,13 +509,18 @@ namespace allscale { namespace dashboard
                         total_efficiency += cur.efficiency;
                         cur_power += cur.cur_power;
                     }
+
                     max_power += cur.max_power;
                 }
 
                 state.speed = total_speed / client.localities_.size();
 //                 state.speed = std::pow(total_speed, 1.f/client.localities_.size());
-
+#if defined(ALTERNATIVE_SCORE)
+                // VV: This is the number of active threads
+                state.efficiency = total_efficiency;
+#else
                 state.efficiency = total_efficiency / client.localities_.size();
+#endif
                 state.power = (max_power > 0) ? cur_power/max_power : 0;
 
                 auto exponents = scheduler::get_optimizer_exponents();
diff --git a/src/optimizer.cpp b/src/optimizer.cpp
index 19731e8..389aa5e 100644
--- a/src/optimizer.cpp
+++ b/src/optimizer.cpp
@@ -17,6 +17,7 @@
 #include <cmath>
 #include <iostream>
 #include <iomanip>
+#include <map>
 
 #include <sys/types.h>
 #include <unistd.h>
@@ -25,11 +26,20 @@
 
 #define TRULY_RANDOM_DEBUG
 
+#define DEBUG_NMD_INO 1
+
+#ifdef DEBUG_NMD_INO
+#define OUT_DEBUG(X) X
+#else
+#define OUT_DEBUG(X) \
+    {                \
+    }
+#endif
+
 namespace allscale
 {
     optimizer_state get_optimizer_state()
     {
-        static float last_energy = 0.f;
         float load = 1.f - monitor::get().get_idle_rate();
         float my_time = monitor::get().get_avg_time_last_iterations(HISTORY_ITERATIONS);
 
@@ -37,18 +47,19 @@ namespace allscale
             my_time = -1.f;
 
         allscale::components::monitor *monitor_c = &allscale::monitor::get();
-        float energy = 100.f;
-#ifdef POWER_ESTIMATE
-        energy = monitor_c->get_current_power();
+        float power_now = 0.001f;
+#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
+        power_now = monitor_c->get_current_power() / monitor_c->get_max_power();
 #endif
-
+        // VV: Use power as if it were energy
         return {
             load,
             monitor::get().get_task_times(),
             my_time,
-            energy,
+            power_now,
             float(monitor_c->get_current_freq(0)),
-            scheduler::get().get_active_threads()
+            scheduler::get().get_active_threads(),
+            scheduler::get().get_total_threads()
         };
     }
 // optimizer_state get_optimizer_state()
@@ -89,11 +100,15 @@ namespace allscale
         scheduler::apply_new_mapping(new_mapping);
     }
 
+    void optimizer_update_max_threads(std::size_t max_threads) { // thin forwarder; registered as HPX action allscale_optimizer_update_max_threads
+        scheduler::update_max_threads(max_threads);
+    }
 } // namespace allscale
 
 HPX_PLAIN_DIRECT_ACTION(allscale::get_optimizer_state, allscale_get_optimizer_state_action);
 HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_policy, allscale_optimizer_update_policy_action);
 HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_policy_ino, allscale_optimizer_update_policy_action_ino);
+HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_max_threads, allscale_optimizer_update_max_threads);
 
 namespace allscale
 {
@@ -127,6 +142,19 @@ tuning_objective get_default_objective()
         return tuning_objective::efficiency();
     if (obj == "power")
         return tuning_objective::power();
+    if ( obj == "local") {
+        double time_weight, energy_weight, resource_weight;
+        
+        auto &&local_scheduler = scheduler::get();
+
+        local_scheduler.get_local_optimizer_weights(&time_weight,
+                                                    &energy_weight,
+                                                    &resource_weight);
+        // VV: If the local-optimizer is used too then copy its objectives
+        return tuning_objective(time_weight, 
+                                resource_weight, 
+                                energy_weight);
+    }
 
     float speed = 0.0f;
     float efficiency = 0.0f;
@@ -170,14 +198,28 @@ float estimate_power(float frequency)
 
 global_optimizer::global_optimizer()
     : u_balance_every(10), u_steps_till_rebalance(u_balance_every),
-    active_nodes_(allscale::get_num_localities(), true), tuner_(new simple_coordinate_descent(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)})),
+    active_nodes_(allscale::get_num_localities(), true),
     objective_(get_default_objective()),
     active_(true), localities_(hpx::find_all_localities()),
-    f_resource_max(-1.0f), f_resource_leeway(-1.0f)
+    f_resource_max(-1.0f), f_resource_leeway(-1.0f), 
+    nmd(0.005),
+    nmd_initialized(0),
+    nodes_min(1), nodes_max(localities_.size()), threads_min(0), threads_max(0),
+    last_optimization_score(1.0)
 {
     char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
-
-    if (c_policy && strncasecmp(c_policy, "ino", 3) == 0 )
+    char *const c_tuner = std::getenv("ALLSCALE_TUNER");
+
+    std::string input_objective_str =
+      hpx::get_config_entry("allscale.objective", "");
+    
+    if ( input_objective_str == "allscale" )
+        use_lopt = true;
+    else
+        use_lopt = false;
+    previous_num_nodes = localities_.size();
+
+    if (c_policy && strcasecmp(c_policy, "ino") == 0 )
     {
         char *const c_resource_max = std::getenv("ALLSCALE_RESOURCE_MAX");
         char *const c_resource_leeway = std::getenv("ALLSCALE_RESOURCE_LEEWAY");
@@ -195,20 +237,64 @@ global_optimizer::global_optimizer()
             f_resource_max = 0.75f;
         else
             f_resource_max = atof(c_resource_max);
+        
+        nodes_min = f_resource_leeway * localities_.size();
+    }
 
+    nodes_max = localities_.size();
+
+    if ( nodes_min < 1 )
+        nodes_min = 1;
+
+    if ( c_policy && strcasecmp(c_policy, "ino") == 0 ) // strcasecmp() returns 0 on a match
         o_ino = allscale::components::internode_optimizer_t(localities_.size(),
                                                             (double) f_resource_max,
                                                             (double) f_resource_leeway,
                                                             INO_DEFAULT_FORGET_AFTER);
+    
+    if ( c_policy && strcasecmp(c_policy, "ino_nmd") == 0 ) { // strcasecmp() returns 0 on a match
+        char *const c_threads_min = std::getenv("ALLSCALE_GINO_THREADS_MIN");
+        char *const c_threads_max = std::getenv("ALLSCALE_GINO_THREADS_MAX");
+        
+        if ( c_threads_min )
+            threads_min = atoi(c_threads_min);
+        
+        if ( c_threads_max )
+            threads_max = atoi(c_threads_max);
     }
-//     else if ( strncasecmp(c_policy, "truly_random", 12) == 0 ) {
-//         char *const c_balance_every = std::getenv("ALLSCALE_TRULY_RANDOM_BALANCE_EVERY");
-//
-//         if ( c_balance_every ) {
-//             u_balance_every = (std::size_t) atoi(c_balance_every);
-//             u_steps_till_rebalance = u_balance_every;
-//         }
-//     }
+
+    // VV: Guestimate that max iter time is 500 ms (will be refined over time)
+    objectives_scale[0] = 0.5;
+    objectives_scale[1] = 1.0;
+    objectives_scale[2] = 1.0;
+
+    if (c_policy && strcasecmp(c_policy, "neldermead") == 0) { // NOTE(review): c_tuner (ALLSCALE_TUNER) is fetched but never consulted - confirm which variable should pick the tuner
+        std::cout << "Choosing NelderMead Optimizer for global optimization" << std::endl;
+        tuner_ = std::make_unique<nmd_optimizer>(nodes_min, nodes_max);
+    }
+    else {
+        std::cout << "Choosing Coordinate Descent Optimizer for global optimization" << std::endl;
+        tuner_ = std::make_unique<simple_coordinate_descent>(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)});
+    }
+}
+
+double global_optimizer::get_optimization_score()
+{
+    return last_optimization_score; // starts at 1.0 (constructor); balance_ino_nmd() overwrites it with nmd.evaluate_score(...)
+}
+
+void global_optimizer::signal_objective_changed()
+{
+    // Weight order handed to the optimizer: speed, power, efficiency.
+    const double new_weights[3] = {objective_.speed_exponent,
+                                   objective_.power_exponent,
+                                   objective_.efficiency_exponent};
+    nmd.set_weights(new_weights);
+
+    // Clearing the flag makes balance_ino_nmd() rebuild the simplex on its
+    // next run (it tests nmd_initialized == 0). Storing 0 over an existing 0
+    // changes nothing, so the reset needs no guard.
+    nmd_initialized = 0;
+}
 
 void global_optimizer::tune(std::vector<optimizer_state> const &state)
@@ -234,7 +320,7 @@ void global_optimizer::tune(std::vector<optimizer_state> const &state)
             total_efficiency += state[i].load_ * (float(state[i].active_frequency_ * state[i].cores_per_node_) / float(max_frequency * state[i].cores_per_node_));;
             used_power += state[i].energy_;
         }
-#ifdef POWER_ESTIMATE
+#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ)
         max_power += monitor_c->get_max_power();
 #endif
     }
@@ -364,7 +450,7 @@ hpx::future<void> global_optimizer::decide_random_mapping(const std::vector<std:
         #ifdef TRULY_RANDOM_DEBUG
         std::cerr << "Will exclude " << how_many_to_exclude << " out of " << num_localities << std::endl;
         #endif
-
+        #if 1
         for (auto i=0ul; i<how_many_to_exclude; ++i) {
             auto new_exclude = get_random_node();
             exclude.push_back(new_exclude);
@@ -373,6 +459,15 @@ hpx::future<void> global_optimizer::decide_random_mapping(const std::vector<std:
             std::cerr << "Excluded: " << new_exclude << std::endl;
             #endif
         }
+        #else
+        for ( auto i=num_localities-how_many_to_exclude; i<num_localities; ++i) {
+            exclude.push_back(num_localities-i-1);
+
+            #ifdef TRULY_RANDOM_DEBUG
+            std::cerr << "Excluded: " << i << std::endl;
+            #endif
+        }
+        #endif
     }
 
     u_steps_till_rebalance = u_balance_every;
@@ -398,6 +493,323 @@ hpx::future<void> global_optimizer::decide_random_mapping(const std::vector<std:
         );
 }
 
+hpx::future<void> global_optimizer::balance_ino_nmd(const std::vector<std::size_t> &old_mapping)
+{
+    u_steps_till_rebalance = u_balance_every;
+    return hpx::lcos::broadcast<allscale_get_optimizer_state_action>(localities_)
+        .then(
+            [this, old_mapping](hpx::future<std::vector<optimizer_state> > future_state) {
+                std::lock_guard<mutex_type> l(mtx_);
+                std::size_t num_active_nodes = std::count(active_nodes_.begin(),                                active_nodes_.end(), true);
+                
+                auto state = future_state.get();
+                float avg_time = 0;
+                float avg_energy = 0;
+                float avg_threads = 0;
+                int from_node = 0;
+
+                std::size_t num_avg_time = 0ul;
+
+                for (const auto &s:state) {
+                    // VV: Only keep track of nodes that were selected by last step.
+                    //     from_node advances exactly once per visited entry (below); a
+                    //     second increment made the loop stop after half of the nodes.
+                    if ( from_node++ == previous_num_nodes )
+                        break;
+
+                    if ( s.avg_time_ > 0.0) {
+                        avg_time += s.avg_time_;
+                        num_avg_time ++;
+                    }
+
+                    avg_energy += s.energy_;
+                    avg_threads += s.active_cores_per_node_ / (float) s.cores_per_node_;
+                }
+                
+                if ( num_avg_time )
+                    avg_time /= num_avg_time;
+                else
+                    avg_time = 0.0;
+
+                avg_energy /= num_active_nodes;
+                avg_threads /= num_active_nodes;
+
+                // VV: First record current state
+                double measurements[3] = {avg_time, 
+                                          avg_energy, 
+                                          avg_threads};
+                
+                if ( objectives_scale[0] < avg_time ) {
+                    objectives_scale[0] = avg_time * 2.0;
+                    nmd.set_scale(objectives_scale);
+                }
+
+                if ( nmd_initialized == 0 ) {
+                    // VV: Weights must follow the order of measurements[] (time, energy,
+                    //     threads), i.e. speed, power, efficiency - as in signal_objective_changed()
+                    double weights[] = {(double) objective_.speed_exponent,
+                                        (double) objective_.power_exponent,
+                                        (double) objective_.efficiency_exponent};
+                    const double constraint_min[] = {(double) nodes_min,
+                                                      (double) threads_min};
+                    const double constraint_max[] = {(double) nodes_max,
+                                                    (double) threads_max};
+                    nmd.set_scale(objectives_scale);
+                    nmd.initialize_simplex(weights,
+                                            nullptr,
+                                            constraint_min,
+                                            constraint_max);
+                    nmd_initialized = 1;
+                }
+
+                auto action = nmd.step(measurements, 
+                                        previous_num_nodes,
+                                        avg_threads * previous_num_nodes);
+                
+                last_optimization_score = nmd.evaluate_score(measurements, nullptr);
+
+                // VV: Todo do something with the action
+                //     assume that .threads = nodes and .freq_idx = threads per node
+                int new_num_nodes = action.threads;
+                int new_threads_per_node = action.freq_idx;
+
+                if ( new_num_nodes != previous_num_nodes ) {
+                    // VV: Need to redistribute tasks to nodes.
+                    //     Try to move as few as possible tasks
+                    /* VV: Balancing algorithm:
+                        new_avg_tasks = ceil(total_tasks / new_num_nodes)
+                        node_to_tasks{} = find out which tasks each node is computing()
+                        
+                        if ( new_num_nodes < previous_nodes ) {
+                            // VV: Evenly distribute all now orphaned tasks to remaining nodes
+                            orphaned_tasks = those which were running on the now unused nodes
+                            for ( node:new_used_nodes ) {
+                                old_tasks = size(node_to_tasks[node])
+                                added_to_node = 0;
+                                while (remaining_orphaned 
+                                        && added_to_node < new_avg_tasks-old_tasks) {
+                                    orphan = orphaned.pop()
+                                    node.tasks.push_back(orphan)
+                                    added_to_node ++;
+                                }
+                            }
+                        } else if ( new_num_nodes > previous_node ) {
+                            num_need_to_move = new_avg_tasks;
+                            node_to_move = previous_nodes;
+
+                            // VV: Redistribute last tasks from overflowed nodes to new ones
+                            while ( num_need_to_move > 0 && node_to_move < new_num_nodes ) {
+                                for ( node:new_used_nodes ) {
+                                    if ( num_need_to_move == 0 ) {
+                                        if ( node_to_move < new_num_nodes) {
+                                            node_to_move ++;
+                                            num_need_to_move = new_avg_tasks;
+                                        } else {
+                                            break;
+                                        }
+                                    }
+
+                                    task = node.tasks[-1]
+                                    node_to_tasks[node_to_move].tasks.push_back(task)
+                                    num_need_to_move --
+                                }
+                            }
+                        }
+                    */
+                    // VV: Some of the nodes might be dead, convert the virtual name
+                    //     to the physical name
+                    auto virtual_to_physical = std::vector<std::size_t>();
+
+                    std::size_t cur_node = 0ul;
+
+                    for (const auto &physical:active_nodes_) {
+                        if ( physical ) {
+                            OUT_DEBUG(
+                                std::cout << "[Ino_NMD] Node " << cur_node << " is was used last time" << std::endl;
+                            )
+                        } else {
+                            OUT_DEBUG(
+                                std::cout << "[Ino_NMD] Node " << cur_node << " was not used last time" << std::endl;
+                            )
+                        }
+
+                        virtual_to_physical.push_back(cur_node);
+                        cur_node ++;
+                    }
+                    num_active_nodes = active_nodes_.size();
+                    
+                    if ( new_num_nodes > num_active_nodes )
+                        new_num_nodes = num_active_nodes;
+                    
+                    if ( previous_num_nodes > num_active_nodes )
+                        previous_num_nodes = num_active_nodes;
+                    
+                    auto new_avg_tasks = (std::size_t) std::ceil(old_mapping.size()/
+                                                                 (float)new_num_nodes);
+                    auto new_mapping = std::vector<std::size_t>(old_mapping.size(), 0ul);
+                    auto node_to_tasks = std::map<std::size_t, std::vector<std::size_t> >();
+                    // VV: node_to_tasks maps node id to list of tasks that it's running
+                    std::size_t task_id = 0;
+                    
+
+                    for (auto i=0ul; i<num_active_nodes; ++i)
+                        node_to_tasks.insert(std::make_pair(i, std::vector<std::size_t>()));
+
+                    for ( const auto &node_id:old_mapping )
+                        node_to_tasks[node_id].push_back(task_id++);
+
+                    OUT_DEBUG(
+                        std::cout << "[Ino_NMD] Rebalancing (original):" << std::endl;
+
+                        for ( const auto &node: node_to_tasks ) {
+                            std::cout << "node " << node.first << ": ";
+                            for ( const auto &task:node.second)
+                                std::cout << " " << task;
+                            std::cout << std::endl;
+                        }
+                    )
+
+                    // VV: Something else is setting the scheduling policy too
+                    //     try to redistribute tasks to all @previous_num_nodes
+                    OUT_DEBUG(
+                        std::cout << "[GLOBAL OPTIMIZER] Re-balancing previous nodes" << std::endl;
+                    )
+
+                    auto prev_avg_tasks =
+                    (std::size_t) std::ceil(old_mapping.size() /
+                                            (float)previous_num_nodes);
+                    auto node_fewer_tasks = 1ul;
+
+                    for (auto node_id = 0ul; node_id < num_active_nodes; ++node_id)
+                    {
+                        auto &node = node_to_tasks[node_id];
+                        while (node.size() > prev_avg_tasks)
+                        {
+                            while (node_to_tasks[node_fewer_tasks].size() >= prev_avg_tasks)
+                                if (++node_fewer_tasks == previous_num_nodes)
+                                    break;
+
+                            if (node_fewer_tasks == previous_num_nodes)
+                                break;
+
+                            auto task = node.back();
+                            node.pop_back();
+                            node_to_tasks[node_fewer_tasks].push_back(task);
+                        }
+                    }
+
+                    OUT_DEBUG(
+                        std::cout << "[GLOBAL OPTIMIZER] Rebalanced (still original):" << std::endl;
+
+                        for ( const auto &node: node_to_tasks ) {
+                            std::cout << "node " << node.first << ": ";
+                            for ( const auto &task:node.second)
+                                std::cout << " " << task;
+                            std::cout << std::endl;
+                        }
+
+
+                        std::cout << "[GLOBAL OPTIMIZER] Changing nodes from "
+                                << previous_num_nodes
+                                << " to " << new_num_nodes << std::endl;
+                    )
+
+                    if (new_num_nodes < previous_num_nodes)
+                    {
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Decreasing nodes" << std::endl;
+                        )
+                        auto lost_node = new_num_nodes;
+
+                        while (lost_node < previous_num_nodes && node_to_tasks[lost_node].size())
+                        {
+                            for (auto node_id = 0ul; node_id < new_num_nodes; ++node_id)
+                            {
+                                auto &node = node_to_tasks[node_id];
+                                auto old_tasks = node.size();
+                                for (auto new_tasks = old_tasks;
+                                     lost_node < previous_num_nodes && new_tasks < new_avg_tasks;
+                                     new_tasks++)
+                                {
+                                    // VV: Move next orphaned task to @node
+                                    while (node_to_tasks[lost_node].size() == 0)
+                                    {
+                                        if (++lost_node == previous_num_nodes)
+                                            break;
+                                    }
+
+                                    if (lost_node == previous_num_nodes)
+                                        break;
+
+                                    std::size_t task = node_to_tasks[lost_node].back();
+                                    node_to_tasks[lost_node].pop_back();
+                                    node.push_back(task);
+                                }
+                            }
+                        }
+                    }
+                    else if (new_num_nodes > previous_num_nodes)
+                    {
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Increasing nodes" << std::endl;
+                        )
+                        auto new_node = previous_num_nodes - 1;
+                        for (auto node_id = 0ul; node_id < previous_num_nodes; ++node_id)
+                        {
+                            auto &node = node_to_tasks[node_id];
+                            while (node.size() > new_avg_tasks)
+                            {
+                                while (node_to_tasks[new_node].size() >= new_avg_tasks)
+                                    if (++new_node == new_num_nodes)
+                                        break;
+
+                                if (new_node == new_num_nodes)
+                                    break;
+
+                                auto task = node.back();
+                                node.pop_back();
+                                node_to_tasks[new_node].push_back(task);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Did not modify mapping" << std::endl;
+                        )
+                    }
+
+                    if (previous_num_nodes != new_num_nodes ) {
+                        OUT_DEBUG(
+                            std::cout << "[GLOBAL OPTIMIZER] Rebalancing (NEW):" << std::endl;
+
+                            for ( const auto &node: node_to_tasks ) {
+                                std::cout << "node " << node.first << ": ";
+                                for ( const auto &task:node.second)
+                                    std::cout << " " << task;
+                                std::cout << std::endl;
+                            }
+                        )
+
+                        // VV: Flatten node_to_tasks into the task -> physical node mapping
+                        //     (new_mapping is zero-filled; it used to send every task to node 0)
+                        for ( const auto &node: node_to_tasks )
+                            for ( const auto &task: node.second )
+                                new_mapping[task] = virtual_to_physical[node.first];
+
+                        previous_num_nodes = new_num_nodes;
+                        hpx::lcos::broadcast_apply<allscale_optimizer_update_policy_action_ino>(localities_, new_mapping);
+
+                        for (auto i=0ul; i<active_nodes_.size(); ++i)
+                            active_nodes_[i] = i < (std::size_t) new_num_nodes;
+                    }
+
+                    if ( threads_min != threads_max )
+                        hpx::lcos::broadcast_apply<allscale_optimizer_update_max_threads>(localities_, new_threads_per_node);
+                }
+            });
+}
+
 hpx::future<void> global_optimizer::balance_ino(const std::vector<std::size_t> &old_mapping)
 {
     /*VV: Compute the new ino_knobs (i.e. number of Nodes), then assign tasks to
@@ -475,6 +887,7 @@ hpx::future<void> global_optimizer::balance_ino(const std::vector<std::size_t> &
                     #ifdef INO_DEBUG_DECIDE_SCHEDULE
                     std::cerr << "Ino picked a schedule" << std::endl;
                     #endif
+
                     for (auto node_wis : ino_schedule)
                         for (auto wi : node_wis.second.v_work_items)
                             new_mapping[wi] = node_wis.first;
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 95367c6..05a7479 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -92,18 +92,9 @@ namespace allscale
                     obj = objective_str.substr(0, idx);
                     leeway = std::stod( objective_str.substr(idx + 1) );
                 }
-
-                if (obj == "time")
-                {
-                    enable_elasticity = true;
-                    break;
-                }
-                else if (obj == "resource")
-                {
-                    enable_elasticity = true;
-                    break;
-                }
             }
+            
+            enable_elasticity = true;
         }
 
         rp.set_default_pool_name("allscale-numa-0");
@@ -175,6 +166,7 @@ namespace allscale
                  * ALLSCALE_RESOURCE_LEEWAY = (0.0, 1.0) // extra percentage allowed to explore
                  */
                 ino,
+                ino_nmd,
                 random,
                 truly_random
             };
@@ -194,8 +186,12 @@ namespace allscale
                         return "tuned";
                     case ino:
                         return "ino";
+                    case ino_nmd:
+                        return "ino_nmd";
                     case random:
                         return "random";
+                    case truly_random:
+                        return "truly_random";
                     default:
                         return "unknown";
                 }
@@ -232,6 +228,19 @@ namespace allscale
                     tree_scheduling_policy::create_uniform(allscale::get_num_localities())
                 };
             }
+            if (policy == "ino_nmd" ) {
+                return {
+                    replacable_policy::ino_nmd,
+                    tree_scheduling_policy::create_uniform(allscale::get_num_localities())
+                };
+            }
+            if (policy == "truly_random")
+            {
+                return {
+                    replacable_policy::truly_random,
+                    tree_scheduling_policy::create_uniform(allscale::get_num_localities())
+                };
+            }
             if (policy == "random")
             {
                 return {
@@ -286,6 +295,8 @@ namespace allscale
           , right_id_(std::move(other.right_id_))
           , is_root_(other.is_root_)
           , optimizer_(std::move(other.optimizer_))
+          , use_gopt(other.use_gopt)
+          , use_lopt(other.use_lopt)
         {
             HPX_ASSERT(false);
         }
@@ -298,6 +309,20 @@ namespace allscale
           , parent_(here_.getParent())
           , is_root_(here_ == root_)
         {
+            char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY");
+            std::string input_objective_str = hpx::get_config_entry("allscale.objective", "");
+
+            if (c_policy && strcasecmp(c_policy, "ino") == 0 )
+                use_gopt = true;
+            else
+                use_gopt = false;
+            
+            if ( input_objective_str == "allscale" )
+                use_lopt = true;
+            else
+                use_lopt = false;
+
+
             if (parent_.getRank() != scheduler::rank())
             {
                 parent_id_ = hpx::naming::get_id_from_locality_id(
@@ -325,7 +350,7 @@ namespace allscale
 
             if (is_root_) run();
         }
-
+        
         std::string policy()
         {
             return policy_.policy();
@@ -334,8 +359,13 @@ namespace allscale
         void apply_new_mapping(const std::vector<std::size_t> &new_mapping)
         {
             std::lock_guard<mutex_type> l(mtx_);
-            policy_.policy_ = tree_scheduling_policy::from_mapping(*policy_.policy_,
-                                                                    new_mapping);
+            policy_.policy_ = // new policy built from the current one plus new_mapping (mtx_ held)
+                tree_scheduling_policy::from_mapping(*policy_.policy_, new_mapping);
+        }
+
+        void update_max_threads(std::size_t max_threads) {
+            // Hand the new limit to the object returned by scheduler::get().
+            scheduler::get().update_max_threads(max_threads);
         }
 
         void toggle_node(std::size_t locality_id)
@@ -357,22 +387,74 @@ namespace allscale
             }
         }
 
+        double get_local_objective() {
+            // Reports the last objective score held by the object behind scheduler::get().
+            return scheduler::get().get_last_objective_score();
+        }
+
+        double get_last_objective_score()
+        {
+            // Read the score straight from the object returned by scheduler::get().
+            return scheduler::get().get_last_objective_score();
+        }
+
         void set_speed_exponent(float exp)
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.speed_exponent = exp;
+            optimizer_.signal_objective_changed();
+
+            // Keep the local optimizer in step: fetch its current weight
+            // triple, overwrite only the time weight with the new exponent,
+            // and store the triple back.
+            auto &&local = scheduler::get();
+            double w_time, w_energy, w_resource;
+            local.get_local_optimizer_weights(&w_time, &w_energy, &w_resource);
+
+            w_time = static_cast<double>(exp);
+
+            local.set_local_optimizer_weights(w_time,
+                                              w_energy,
+                                              w_resource);
         }
 
         void set_efficiency_exponent(float exp)
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.efficiency_exponent = exp;
+            optimizer_.signal_objective_changed();
+
+            // Keep the local optimizer in step: fetch its current weight
+            // triple, overwrite only the resource weight with the new
+            // exponent, and store the triple back.
+            auto &&local = scheduler::get();
+            double w_time, w_energy, w_resource;
+            local.get_local_optimizer_weights(&w_time, &w_energy, &w_resource);
+
+            w_resource = static_cast<double>(exp);
+
+            local.set_local_optimizer_weights(w_time,
+                                              w_energy,
+                                              w_resource);
         }
 
         void set_power_exponent(float exp)
         {
             std::lock_guard<mutex_type> l(optimizer_.mtx_);
             optimizer_.objective_.power_exponent = exp;
+            optimizer_.signal_objective_changed();
+            // Keep the local optimizer in step: fetch its current weight
+            // triple, overwrite only the energy weight with the new exponent,
+            // and store the triple back.
+            auto &&local = scheduler::get();
+            double w_time, w_energy, w_resource;
+            local.get_local_optimizer_weights(&w_time, &w_energy, &w_resource);
+
+            w_energy = static_cast<double>(exp);
+
+            local.set_local_optimizer_weights(w_time,
+                                              w_energy,
+                                              w_resource);
         }
 
         hpx::util::tuple<float, float, float> get_optimizer_exponents()
@@ -385,6 +467,7 @@ namespace allscale
             );
         }
 
+        bool use_gopt, use_lopt; // set in the constructor: use_gopt when ALLSCALE_SCHEDULING_POLICY equals "ino" (case-insensitive), use_lopt when config entry allscale.objective is "allscale"
 
         void set_policy(std::string policy)
         {
@@ -448,6 +531,16 @@ namespace allscale
                 tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
                 optimizer_.balance_ino(old.task_distribution_mapping());
             }
+            
+            if ( policy_.value_ == replacable_policy::ino_nmd) {
+                tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
+                optimizer_.balance_ino_nmd(old.task_distribution_mapping());
+            }
+
+            if (policy_.value_ == replacable_policy::truly_random) {
+                tree_scheduling_policy const& old = static_cast<tree_scheduling_policy const&>(*policy_.policy_);
+                optimizer_.decide_random_mapping(old.task_distribution_mapping());
+            }
 
             return true;
         }
@@ -462,7 +555,7 @@ namespace allscale
 
         void schedule(work_item work)
         {
-            if (is_root_ && work.id().is_root() && work.id().id % 20 == 0)
+            if (is_root_ && work.id().is_root() && work.id().id % 5 == 0)
             {
                 balance();
             }
@@ -667,6 +760,23 @@ namespace allscale
         );
     }
 
+    // Prints the last objective score of every local scheduler_service and returns the final one (0.0 if none).
+    double get_last_objective_score()
+    {
+        std::vector<double> scores;
+        runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(
+            [&](scheduler_service& sched) {
+                scores.push_back(sched.get_last_objective_score());
+            }
+        );
+        std::cout << "GET_LAST_OBJETIVE_SCORE (SCHED): got " << scores.size() << " values" << std::endl;
+        for (const auto &score: scores ) {
+            std::cout << score  << std::endl;
+        }
+        // A value must be returned: falling off the end of a non-void function is undefined behaviour.
+        return scores.empty() ? 0.0 : scores.back();
+    }
+
     void set_efficiency_exponent_broadcast(float exp)
     {
         runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(
@@ -747,6 +857,16 @@ namespace allscale
         monitor::get().set_cur_freq(freq);
     }
 
+    // Pushes the new thread limit to every scheduler_service that forAllLocal visits.
+    void scheduler::update_max_threads(std::size_t max_threads)
+    {
+        runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(
+            [&max_threads](scheduler_service& service) {
+                service.update_max_threads(max_threads);
+            }
+        );
+    }
+
     void scheduler::apply_new_mapping(const std::vector<std::size_t> &new_mapping)
     {
         runtime::HierarchicalOverlayNetwork::forAllLocal<scheduler_service>(
diff --git a/src/tuner.cpp b/src/tuner.cpp
index 546a2be..2203687 100644
--- a/src/tuner.cpp
+++ b/src/tuner.cpp
@@ -4,6 +4,8 @@
 #include <allscale/components/monitor.hpp>
 #include <allscale/utils/printer/vectors.h>
 #include <allscale/utils/optional.h>
+#include <algorithm>
+
 
 namespace allscale {
     std::ostream& operator<<(std::ostream& os, tuner_configuration const& cfg)
@@ -204,4 +206,104 @@ namespace allscale {
         // print a status message
         std::cerr << "New search direction: " << (dim == num_nodes ? "#nodes" : "frequency") << " " << (dir == up ? "up" : "down") << "\n";
     }
+
+    /// Sets up the search bounds used by `nmd`: constraint index 0 holds the
+    /// node-count bounds [nodes_min, nodes_max], constraint index 1 holds the
+    /// bounds of an index into the sorted list of available frequencies.
+    nmd_optimizer::nmd_optimizer(std::size_t nodes_min,
+                                 std::size_t nodes_max)
+    : nmd(2, 3, 0.01, 2000, 50ul)
+    , converged(false)
+    {
+        // next() may read this flag before writing it, so give it a defined value here.
+        initialized = false;
+        constraint_min[0] = nodes_min;
+        constraint_max[0] = nodes_max;
+        // Sorted ascending, so a larger index always means a larger frequency value.
+        avail_freqs = monitor::get().get_available_freqs(0);
+        std::sort(avail_freqs.begin(), avail_freqs.end());
+        // With no frequencies reported, the frequency dimension collapses to [0, 0].
+        constraint_min[1] = 0;
+        constraint_max[1] = avail_freqs.empty() ? 0 : avail_freqs.size() - 1;
+        // Start from all-zero weights; next() compares incoming weights against these.
+        previous_weights[0] = 0;
+        previous_weights[1] = 0;
+        previous_weights[2] = 0;
+    }
+
+    /// Picks the next node-count / frequency configuration for the tuner.
+    /// @param current_cfg   its node mask and frequency form the observed point
+    /// @param current_state its speed, efficiency and power are the measurements
+    /// @param obj           its three exponents are handed to `nmd` as weights
+    /// @return current_cfg's mask with the first action[0] entries set and the
+    ///         rest cleared, plus the frequency selected by action[1]
+    tuner_configuration nmd_optimizer::next(tuner_configuration const& current_cfg, tuner_state const& current_state, tuning_objective obj)
+    {
+        tuner_configuration res;
+        auto action = std::vector<std::size_t>();
+
+        const double weights[] = {
+                obj.speed_exponent, obj.efficiency_exponent, obj.power_exponent
+        };
+
+        // Accumulate |previous - current| by hand: an unqualified abs() can bind
+        // to the integer ::abs(int) and truncate every sub-unit change to zero.
+        double diff = 0.0;
+        for (auto i=0ul; i<3; ++i) {
+            const double delta = previous_weights[i] - weights[i];
+            diff += (delta < 0.0) ? -delta : delta;
+            previous_weights[i] = weights[i];
+        }
+
+        if ( diff > 0.01 ) {
+            // VV: Enforce exploration
+            initialized = false;
+            this->converged = false;
+        }
+
+        if ( initialized == false ){
+            nmd.initialize(constraint_min,
+                            constraint_max,
+                            nullptr,
+                            weights,
+                            &nmd.score_speed_efficiency_power);
+            initialized = true;
+        }
+
+        if ( this->converged == false ) {
+            double measurements[3] = {current_state.speed, current_state.efficiency, current_state.power};
+
+            std::size_t num_active_nodes = std::count(current_cfg.node_mask.begin(),
+                        current_cfg.node_mask.end(),
+                        true);
+            // A frequency that is not in the sorted list is reported as index 0.
+            auto e = std::find(avail_freqs.begin(), avail_freqs.end(), current_cfg.frequency);
+            std::size_t freq_idx = (e == avail_freqs.end()) ? 0 : (e - avail_freqs.begin());
+
+            const std::size_t observed[] = {num_active_nodes, freq_idx};
+            auto ret = nmd.get_next(measurements, observed);
+            action.assign(ret.first.begin(), ret.first.end());
+
+            if (ret.second) {
+                // Remember the converged point so later calls can replay it.
+                best.assign(action.begin(), action.end());
+                this->converged = true;
+            }
+        } else {
+            action.assign(best.begin(), best.end());
+        }
+
+        res.node_mask.assign(current_cfg.node_mask.begin(),
+                            current_cfg.node_mask.end());
+        // Missing or out-of-range action entries are clamped, not used as raw indices.
+        action.resize(2, 0);
+        const std::size_t active = std::min<std::size_t>(action[0], res.node_mask.size());
+        std::fill(res.node_mask.begin(), res.node_mask.begin() + active, true);
+        std::fill(res.node_mask.begin() + active, res.node_mask.end(), false);
+
+        res.frequency = action[1];
+        if ( avail_freqs.size() )
+            res.frequency = avail_freqs[std::min<std::size_t>(action[1], avail_freqs.size() - 1)];
+        return res;
+    }
 }