diff --git a/allscale/components/localoptimizer.hpp b/allscale/components/localoptimizer.hpp index c0e588a..9e11ebb 100644 --- a/allscale/components/localoptimizer.hpp +++ b/allscale/components/localoptimizer.hpp @@ -3,6 +3,7 @@ #define ALLSCALE_COMPONENTS_LOCALOPTIMIZER_HPP #include + #if defined(ALLSCALE_HAVE_CPUFREQ) #include #endif @@ -14,223 +15,214 @@ #include //#define MEASURE_MANUAL_ 1 -#define MEASURE_ 1 -//#define DEBUG_ 1 - -namespace allscale { namespace components { - - enum objectiveType {time,energy,resource}; - - enum parameterType {thread, frequency}; - - enum searchPolicy {allscale, random, manual}; - - /* structure type of a single optimization objective */ - struct objective{ - objectiveType type; - /* leeway threshold desired, 0-1 double */ - double leeway; - /* non-negative integer priority of the objective, 0 is highest priority*/ - int priority; - /* local minimum during single objective optimization */ - double localmin; - /* local maximum during single objective optimization */ - double localmax; - /* local minimum during single objective optimization */ - double globalmin; - /* local minimum during single objective optimization */ - double globalmax; - /* current deviation of the objective value from observed min */ - double currentthreshold; - /* sampled objective values throughout execution */ - std::vector samples; - /* thread number that lead to the objective value in samples vector */ - std::vector threads_samples; - /* frequency index that lead to the objective value in samples vector */ - std::vector freq_samples; - /* true if optimization of objective has converged, false otherwise */ - bool converged; - /* true if optimizer for objective has been initialized, false otherwise */ - bool initialized; - /* index to the parameter vectors for setup that has so far achieved - the minimum over all samples */ - long int min_params_idx; - double converged_minimum; - double minimization_params[2]; - }; - - - /* structure type modelling an optimization actuation action to be taken +// #define MEASURE_ 1 +// #define DEBUG_ 1 +// #define DEBUG_MULTIOBJECTIVE_ 1 + +namespace allscale +{ +namespace components +{ + +enum objectiveType +{ + time, + energy, + resource +}; + +enum searchPolicy +{ + allscale, + random, + manual, + none +}; + + +/* structure type modelling an optimization actuation action to be taken by the scheduler */ - struct actuation{ - /* number of threads to resume (>0) or suspend (<0). If set to zero, - number of threads will stay unchanged. */ - unsigned int delta_threads; - -#if defined(ALLSCALE_HAVE_CPUFREQ) - /* index to the global cpu-supported frequencies vector pointing to - the new frequency to be set. If set to -1, frequency will stay - unchanged */ - unsigned int frequency_idx; -#endif - }; - - struct localoptimizer - { - localoptimizer() - :nmd(0.01), -#if defined(ALLSCALE_HAVE_CPUFREQ) - frequency_param_(0), -#endif - current_objective_idx_(0),converged_(false) - { - if (optmethod_==random) - srand (std::time(NULL)); - } - - localoptimizer(std::list); - - bool isConverged(){return converged_;} - - void setPolicy(searchPolicy pol){ - optmethod_ = pol; +struct actuation +{ + unsigned int threads; + int frequency_idx; +}; + +struct localoptimizer +{ + localoptimizer(); + bool isConverged(); + double evaluate_score(const double objectives[]); + void setPolicy(searchPolicy pol) + { + optmethod_ = pol; #ifdef DEBUG_ - std::cout << "Local Optimizer Initialized with " - << policyToString(pol) - << " policy for single objective search." - << std::endl; + std::cout << "Local Optimizer Initialized with " + << policyToString(pol) + << " policy for multi-objective search." + << std::endl; #endif - } - - searchPolicy getPolicy(){return optmethod_;} - - void setobjectives(std::list); - - std::size_t getCurrentThreads(){return threads_param_;} + } + void initialize_nmd(bool from_scratch); + searchPolicy getPolicy() { return optmethod_; } + + // VV: Modifying the objectives triggers restarting the optimizer + void setobjectives(double time_weight, + double energy_weight, + double resource_weight); + + void getobjectives(double *time_weight, + double *energy_weight, + double *resource_weight) + { + if ( time_weight != nullptr ) + *time_weight = this->time_weight; + if ( energy_weight != nullptr ) + *energy_weight = this->energy_weight; + if ( resource_weight != nullptr ) + *resource_weight = this->resource_weight; + } + + void set_objectives_scale(const double objectives_scale[3]); + + std::size_t getCurrentThreads() { return threads_param_; } + + void setCurrentThreads(std::size_t threads) { threads_param_ = threads; } + + unsigned int getCurrentFrequencyIdx() + { + return frequency_param_; + } + + void setCurrentFrequencyIdx(unsigned int idx) { frequency_param_ = idx; } + + const std::vector + setfrequencies(std::vector frequencies) + { + #if 0 + const std::size_t max_freqs = 10; + std::size_t keep_every = (std::size_t) ceilf(frequencies.size() / (float) max_freqs); + + if ( keep_every > 1 ) { + std::vector new_freqs; + + int i, j, len; + + for (j=0, i=0, len=frequencies.size(); i>>> " << el << std::endl; + return frequencies_param_allowed_; + } + + std::size_t getmaxthreads() + { + return max_threads_; + } + + void setmaxthreads(std::size_t threads); + + /* executes one step of multi-objective optimization */ + actuation step(std::size_t active_threads); + + /* adds a measurement sample to the specified objective */ + void measureObjective(double iter_time, double power, double threads); + + /* restarts multi-objective optimization from current best solution */ + void reset(int, int); - void setCurrentThreads(std::size_t threads){threads_param_ = threads;} - -#if defined(ALLSCALE_HAVE_CPUFREQ) - unsigned int getCurrentFrequencyIdx(){return frequency_param_;} - - void setCurrentFrequencyIdx(unsigned int idx){frequency_param_ = idx;} - - const std::vector - setfrequencies(std::vector frequencies){ - frequencies_param_allowed_=frequencies; - //std::cout << "**************** = " << frequency_param_ << std::endl; - //for(auto& el: frequencies_param_allowed_) - // std::cout << "***>>>> " << el << std::endl; - return frequencies_param_allowed_; - } +#ifdef DEBUG_ + void printobjectives(); + void printverbosesteps(actuation); #endif - void setmaxthreads(std::size_t threads){ - max_threads_=threads; - threads_param_=threads; - } + std::string policyToString(searchPolicy pol) + { + std::string str; + switch (pol) + { + case random: + str = "random"; + break; + case allscale: + str = "allscale"; + break; + case manual: + str = "manual"; + break; + } + return str; + } - /* executes one step of multi-objective optimization */ - actuation step(); + private: + double time_weight, energy_weight, resource_weight; - /* adds a measurement sample to the specified objective */ - void measureObjective(double iter_time, double power, double threads); + // VV: Used to convert thread_idx to actual number of threads + std::size_t threads_dt; - /* restarts multi-objective optimization from current best solution */ - void reset(int,int); + void accumulate_objective_measurements(); + void reset_accumulated_measurements(); -#ifdef DEBUG_ - void printobjectives(); - void printverbosesteps(actuation); -#endif - - std::string policyToString(searchPolicy pol){ - std::string str; - switch (pol){ - case random: - str = "random"; - break; - case allscale: - str = "allscale"; - break; - case manual: - str = "manual"; - break; - } - return str; - } + std::vector samples_energy; + std::vector samples_time; + std::vector samples_threads; + std::vector samples_freq; - private: + bool explore_knob_domain; - /* vector of active optimization objectives. Objectives are stored - in the vector in decreasing priority order */ - std::vector objectives_; + double pending_time, pending_energy, pending_threads; + unsigned long pending_num_times; - NelderMead nmd; + bool mo_initialized; - /* counts number of parameter changes (as pair) */ - unsigned long long int param_changes_; + NelderMead nmd; - /* single objective optimization method used */ - searchPolicy optmethod_ = random; + /* single objective optimization method used */ + searchPolicy optmethod_ = none; - /* active optimization parameter - nr of OS threads active */ - int threads_param_; + /* active optimization parameter - nr of OS threads active */ + int threads_param_; - /* ordered set of OS thread values that have been assigned to the + /* ordered set of OS thread values that have been assigned to the runtime by the optimization algorithm. The most recent value is stored at the end of the vector */ - std::vector thread_param_values_; + std::vector thread_param_values_; - /* maximum number of OS threads supported by the runtime */ - std::size_t max_threads_; + /* maximum number of OS threads supported by the runtime */ + std::size_t max_threads_; -#if defined(ALLSCALE_HAVE_CPUFREQ) - /* active optimization parameter - current CPU frequency index */ - unsigned int frequency_param_; + /* active optimization parameter - current CPU frequency index */ + unsigned int frequency_param_; - /* ordered set of frequency values that the CPU has been set to by - the optimization algorithm. The most recent value is stored at the - end of the vector */ - std::vector frequency_param_values_; - - /* vector containing sorted list of frequencies supported by the + /* vector containing sorted list of frequencies supported by the processor */ - std::vector frequencies_param_allowed_; - - /* index to the vector of allowed frequencies that points to the highest - frequency. The ordering of the vector, as reported by hardware - reconfiguration can be platform specific, and therefore we need this - index to make sorted access to the vector platform agnostic */ - const short unsigned int highest_frequency_allowed_idx_ = 0; -#endif - - /* threshold (percentage in [0,1]) to decide convergence of optimization - steps against a single objective */ - const double convergence_threshold_ = 0.02; - - /***** optimization state variables ******/ - - /* index to the _objectives vector of currently optimized objective */ - unsigned short int current_objective_idx_; + std::vector frequencies_param_allowed_; - /* number of times the optimizer step() has been invoked, this is for - init and housekeeping purposes */ - unsigned long long int steps_; + /* threshold (percentage in [0,1]) to decide convergence of optimization + steps */ + double convergence_threshold_; - /* currently optimized parameter */ - parameterType current_param_; + /***** optimization state variables ******/ - /* initial warm-up steps */ - const unsigned int warmup_steps_=3; + /* initial warm-up steps */ + const unsigned int warmup_steps_ = 3; - /* maximum number of optimization steps allowed */ - const int max_steps_=100; + /* set to true if local optimizer has converged over all objectives */ + bool converged_; - /* set to true if local optimizer has converged over all objectives */ - bool converged_; - }; -} -} + double objectives_scale[3]; +}; +} // namespace components +} // namespace allscale #endif diff --git a/allscale/components/nmd.hpp b/allscale/components/nmd.hpp new file mode 100644 index 0000000..7f462ff --- /dev/null +++ b/allscale/components/nmd.hpp @@ -0,0 +1,162 @@ +/* +Nelder Mead implementation for arbitrary number of knobs and number of objectives. + +Developed explicitly for non-continuous search spaces. + +Important information +--------------------- + +This implementation uses a cache coupled with the exploration-heuristic that is explained +bellow to refrain from evaluating the same set of knobs multiple times. + +If NMD proposes to explore a knob-set that has been recently evaluated (i.e. there's a +non stale entry in the cache) the heuristic will instead propose the closest point that is +enclosed within the N-dimensional (where N = num_knobs) space near the knob set that NMD +initially proposed. The N-dimensional space takes a form of a square, Cube, Hypercube for +N=2, 3, 4. Each edge may be at most @max_distance_long (see generate_unique) for more info. + +author: vasiliadis.vasilis@gmail.com +*/ +#ifndef ALLSCALE_NMD_HEADER +#define ALLSCALE_NMD_HEADER +#include +#include +#include +#include +#include + +namespace allscale { +namespace components { + +struct logistics { + std::vector objectives; + std::vector knobs; + + int64_t cache_ts, cache_dt; + + bool converged; +}; + +#define ALPHA 1.0 /* reflection coefficient */ +#define BETA 0.5 /* contraction coefficient */ +#define GAMMA 2.0 /* expansion coefficient */ +#define DELTA 0.5 /* shrinking coefficient */ + +class NmdGeneric { +public: + NmdGeneric(); + NmdGeneric(std::size_t num_knobs, std::size_t num_objectives, + double conv_threshold, int64_t cache_expire_dt_ms, + std::size_t max_iters); + + static double score_speed_efficiency_power(const double measurements[], const double weights[]) + { + double ret = std::pow(measurements[0], weights[0]) * + std::pow(measurements[1], weights[1]) * + std::pow((1-measurements[2]), weights[2]); + + if ( std::isfinite(ret) == 0 || ret > 1.0 ) { + ret = 1.0; + } + + return 1.0 - ret; + } + + void initialize(const std::size_t constraint_min[], const std::size_t constraint_max[], + const std::size_t *initial_config[], const double weights[], + double (*score_function)(const double[], const double [])); + + void ensure_profile_consistency(std::size_t expected[], const std::size_t observed[]) const; + + void set_constraints_now(const std::size_t constraint_min[], + const std::size_t constraint_max[]); + + double score(const double measurements[]) const; + + std::pair, bool> get_next(const double measurements[], + const std::size_t observed_knobs[]); + +protected: + bool test_convergence(); + + // VV: (measurements, weights) returns value in range [0.0, infinite) + // 0.0 means perfect score (i.e. the larger the score, the worse it is) + double (*score_function)(const double[], const double []); + + std::vector do_warmup(const double measurements[], + const std::size_t observed_knobs[]); + std::vector do_reflect(const double measurements[], + const std::size_t observed_knobs[]); + std::vector do_expand(const double measurements[], + const std::size_t observed_knobs[]); + std::vector do_contract_in(const double measurements[], + const std::size_t observed_knobs[]); + std::vector do_contract_out(const double measurements[], + const std::size_t observed_knobs[]); + std::vector do_shrink(); + std::vector do_start(bool consult_cache); + + void sort_simplex(bool consult_cache=true); + void compute_centroid(); + + void generate_unique(std::size_t initial[], bool accept_stale, + const std::set > *extra) const; + std::size_t compute_max_combinations() const; + + template + void apply_constraint(T knobs[]) const + { + for (auto i=0ul; i (T) constraint_max[i] ) + knobs[i] = constraint_max[i]; + } + } + + //VV: Used to generate all possible combinations of +- + // from: https://stackoverflow.com/questions/4633584/ + template + bool next_binary(Iter begin, Iter end) const + { + while (begin != end) // we're not done yet + { + --end; + if ((*end & 1) == 0) // even number is treated as zero + { + ++*end; // increase to one + return true; // still more numbers to come + } + else // odd number is treated as one + { + --*end; // decrease to zero and loop + } + } + return false; // that was the last number + } + + enum estate {warmup, start, reflect, expand, contract_in, contract_out, shrink}; + estate current_state; + std::size_t warmup_step; + + double conv_threshold; + std::size_t num_knobs; + std::size_t num_objectives; + + double *scores; + std::size_t **simplex, **initial_config; + std::size_t *constraint_max, *constraint_min; + std::size_t *point_reflect, *point_contract, *point_expand, *centroid; + std::map< std::vector, logistics> cache; + int64_t cache_expire_dt_ms; + double *weights; + std::size_t times_reentered_start; + double score_reflect, score_contract, score_expand; + bool final_explore; + std::size_t iteration, max_iters; +}; + +} +} + +#endif \ No newline at end of file diff --git a/allscale/components/nmsimplex_bbincr.hpp b/allscale/components/nmsimplex_bbincr.hpp index f894d2b..891938d 100644 --- a/allscale/components/nmsimplex_bbincr.hpp +++ b/allscale/components/nmsimplex_bbincr.hpp @@ -18,136 +18,244 @@ #include #include +#include +#include +#include + #ifdef MACOSX #include #else #include #endif -namespace allscale { namespace components { +namespace allscale +{ +namespace components +{ + +// VV: threads, freq_idx +#define NMD_NUM_KNOBS 2 +// VV: time, energy/power, resources +#define NMD_NUM_OBJECTIVES 3 + + +#if (NMD_NUM_OBJECTIVES != 3) +#error UNSUPPORTED number of Objectives +#endif + +#if (NMD_NUM_KNOBS != 2) +#error UNSUPPORTED number of Knobs +#endif + +#define MAX_IT 1000 /* maximum number of iterations */ +#define ALPHA 1.0 /* reflection coefficient */ +#define BETA 0.5 /* contraction coefficient */ +#define GAMMA 2.0 /* expansion coefficient */ +#define DELTA 0.5 /* shrinking coefficient */ -#define MAX_IT 1000 /* maximum number of iterations */ -#define ALPHA 1.0 /* reflection coefficient */ -#define BETA 0.5 /* contraction coefficient */ -#define GAMMA 2.0 /* expansion coefficient */ +#define CACHE_EXPIRE_AFTER_MS 35000 /* structure type of a single optimization step return status */ -struct optstepresult{ - /* true if optimization has converged for the specified objective */ - bool converged; - /* number of threads for parameters to set for sampling */ - double threads; - /* index to frequency vector for freq parameter to set for sampling*/ - int freq_idx; +struct optstepresult +{ + /* true if optimization has converged for the specified objective */ + bool converged; + /* number of threads for parameters to set for sampling */ + double threads; + /* index to frequency vector for freq parameter to set for sampling*/ + int freq_idx; + + /******VV: Cache stuff******/ + double objectives[3]; // (time, energy, resource) + // VV: _cache_expires denotes dt (in ms) after _cache_timestamp + int64_t _cache_timestamp, _cache_expires_dt; }; +typedef std::map, optstepresult> MapCache_t; + /* enumeration encoding state that the incremental Nelder Mead optimizer is at */ -enum iterationstates {start, reflection, expansion, contraction}; +enum iterationstates +{ + // VV: Need NMD_NUM_KNOBS + 1 values before we can start optimizing + warmup, + start, + reflection, + expansion, + contraction_in, + contraction_out, + shrink +}; + -class NelderMead { +class NelderMead +{ public: - NelderMead(double); - void initialize_simplex(double params[][2], double*,double*,double*); - void print_initial_simplex(); - void print_iteration(); - optstepresult step(double param); - double* getMinVertices(){ - return v[vs]; - } + NelderMead(const NelderMead &other); + NelderMead(double); + // VV: For the time being + // weights = [ W_time, W_energy/power, W_resources ] + // initial_simplex = double[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS] + // constraint_min = [min_threads, min_freq_idx] + void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES], + const double initial_simplex[][NMD_NUM_KNOBS], + const double constraint_min[NMD_NUM_KNOBS], + const double constraint_max[NMD_NUM_KNOBS]); + /* + void initialize_simplex(const double weights[NMD_NUM_OBJECTIVES], + const double constraint_min[NMD_NUM_KNOBS], + const double constraint_max[NMD_NUM_KNOBS]); + */ + + void print_initial_simplex(); + void print_iteration(); + + void set_scale(const double scale[NMD_NUM_OBJECTIVES]); + + double *getMinVertices() + { + return v[vs]; + } + + double getMinObjective() + { + return min; + } + + // VV: Returns a [NMD_NUM_KNOS+1][NMD_NUM_KNOBS] array + void get_simplex(double simplex[][NMD_NUM_KNOBS]) { + for (auto i=0; i + void generate_new(F &gen); + enum direction {up, up_final, down, left, right, right_final}; + std::pair explore_next_extra(double *extra, int level, + direction dir, int level_max, int level_nested_max); - private: - int vg_index(); - int vs_index(); - int vh_index(); - void my_constraints(double*); - void centroid(); - bool testConvergence(); - void updateObjectives(); + //VV: objective_type: { : optstepresult } + MapCache_t cache_; - double round2(double num, int precision) - { - double rnum = 0.0; - int tnum; + void do_invalidate_cache(); + void do_reevaluate_scores(); - if (num == 0.0) - return num; + optstepresult do_step_start(); + optstepresult do_step_reflect(const double objectives[], + double knob1, double knob2); + optstepresult do_step_expand(const double objectives[], + double knob1, double knob2); + optstepresult do_step_contract_in(const double objectives[], + double knob1, double knob2); + optstepresult do_step_contract_out(const double objectives[], + double knob1, double knob2); + optstepresult do_step_shrink(); + optstepresult do_step_warmup(const double objectives[], + double knob1, double knob2); - rnum = num*pow(10,precision); - tnum = (int)(rnum < 0 ? rnum-0.5 : rnum + 0.5); - rnum = tnum/pow(10,precision); + void sort_vertices(void); + void my_constraints(double *); + void centroid(); + bool testConvergence(std::size_t tested_combinations); - return rnum; - } + // VV: Will return false if entry not in cache + bool cache_update(int threads, int freq_idx, + const double objectives[], + bool add_if_new); - /* vertex with smallest value */ - int vs; + bool convergence_reevaluating; + int initial_configurations[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS]; + double scale[NMD_NUM_OBJECTIVES]; + /* vertex with smallest value */ + int vs; - /* vertex with next smallest value */ - int vh; + /* vertex with next smallest value */ + int vh; - /* vertex with largest value */ - int vg; - - int i,j,row; + /* vertex with largest value */ + int vg; - const int n=2; + int i, j, row; - /* track the number of function evaluations */ - int k; + const int n = 2; - /* track the number of iterations */ - int itr; - - /* holds vertices of simplex */ - double **v; + /* track the number of function evaluations */ + int k; - /* value of function at each vertex */ - double *f; + /* track the number of iterations */ + int itr; - /* value of function at reflection point */ - double fr; + /* holds vertices of simplex */ + double v[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS]; - /* value of function at expansion point */ - double fe; + /* value of function at each vertex */ + double f[NMD_NUM_KNOBS+1]; - /* value of function at contraction point */ - double fc; + /* value of function at reflection point */ + double fr; - /* reflection - coordinates */ - double *vr; + /* value of function at expansion point */ + double fe; - /* expansion - coordinates */ - double *ve; + /* value of function at contraction point */ + double fc; - /* contraction - coordinates */ - double *vc; + /* reflection - coordinates */ + double vr[NMD_NUM_KNOBS]; - /* centroid - coordinates */ - double *vm; + /* expansion - coordinates */ + double ve[NMD_NUM_KNOBS]; - double min; - - double fsum,favg,s; + /* contraction - coordinates */ + double vc[NMD_NUM_KNOBS]; + + /* centroid - coordinates */ + double vm[NMD_NUM_KNOBS]; + + double min; + + double fsum, favg, s; + + double EPSILON; + + iterationstates state_; + + const int MAXITERATIONS = 15; - double EPSILON; + double constraint_min[2]; - iterationstates state_; + double constraint_max[2]; - const int MAXITERATIONS = 15; - - double constraint_min[2]; + double opt_weights[NMD_NUM_OBJECTIVES]; - double constraint_max[2]; + double next_constraint_min[NMD_NUM_KNOBS], + next_constraint_max[NMD_NUM_KNOBS]; + bool should_update_constraints = false; + int times_used_cached; }; -} -} +} // namespace components +} // namespace allscale #endif diff --git a/allscale/components/scheduler.hpp b/allscale/components/scheduler.hpp index 7eed6e5..9eb9fbf 100644 --- a/allscale/components/scheduler.hpp +++ b/allscale/components/scheduler.hpp @@ -5,8 +5,10 @@ #include #include #include + #if defined(ALLSCALE_HAVE_CPUFREQ) #include +#else #endif #include @@ -44,6 +46,8 @@ namespace allscale { namespace components { HPX_ASSERT(false); } + bool get_optimization_score(); + scheduler(std::uint64_t rank); void init(); @@ -64,6 +68,22 @@ namespace allscale { namespace components { return active_threads; } + std::size_t get_total_threads() const { + return os_thread_count; + } + + void set_local_optimizer_weights(double time_weight, + double energy_weight, + double resource_weight); + void get_local_optimizer_weights(double *time_weight, + double *energy_weight, + double *resource_weight); + + void update_max_threads(std::size_t max_threads); + + double get_last_objective_score() { + return last_objective_score; + } private: std::size_t get_num_numa_nodes(); @@ -82,18 +102,19 @@ namespace allscale { namespace components { bool do_split(work_item const& work, std::size_t numa_node); bool collect_counters(); - //try to suspend resource_step threads, return number of threads which received a new suspend order; - // REM unsigned int suspend_threads(); - unsigned int suspend_threads(std::size_t); - //try to resume resource_step threads, return number of threads which received a new resume order; - // REM unsigned int resume_threads(); - unsigned int resume_threads(std::size_t); + //try to suspend threads, return number of threads which received a new suspend order; + unsigned int suspend_threads(std::size_t); + + //try to resume threads, return number of threads which received a new resume order; + unsigned int resume_threads(std::size_t); #ifdef MEASURE_ // convenience methods to update measured metrics of interest - void update_active_osthreads(std::size_t); - void update_power_consumption(std::size_t); + void update_active_osthreads(std::size_t threads, int64_t delta_time); + void update_power_consumption(std::size_t power_sample, int64_t delta_time); #endif + double last_objective_score; + int64_t last_measure_power, last_measure_threads; void fix_allcores_frequencies(int index); @@ -108,7 +129,7 @@ namespace allscale { namespace components { long last_optimization_timestamp_; /* periodicity in milliseconds to invoke the optimizer */ - const long optimization_period_ms = 5; + const long optimization_period_ms = 1000; /* captures absolute timestamp of the last time optimization objective value have been measured (sampled) */ @@ -117,7 +138,7 @@ namespace allscale { namespace components { long last_objective_measurement_timestamp_; /* periodicity in milliseconds to invoke objective sampling */ - const long objective_measurement_period_ms = 1; + const long objective_measurement_period_ms = 500; //extra masks to better handle suspending/resuming threads std::vector thread_pools_; @@ -153,19 +174,12 @@ namespace allscale { namespace components { unsigned long long last_power_usage; unsigned long long power_sum; unsigned long long power_count; + #if defined(ALLSCALE_HAVE_CPUFREQ) cpufreq_policy policy; hardware_reconf::hw_topology topo; - std::vector cpu_freqs; - // Indices correspond to the freq id in cpu_freqs, and - // each pair holds energy usage and execution time - std::vector> freq_times; - std::vector>> objectives_status; - unsigned int freq_step; - bool target_freq_found; #endif - unsigned int resource_step; - bool target_resource_found; + std::vector cpu_freqs; mutable mutex_type throttle_mtx_; mutable mutex_type resize_mtx_; @@ -186,9 +200,9 @@ namespace allscale { namespace components { bool resource_requested; bool energy_requested; - double time_leeway; - double resource_leeway; - double energy_leeway; + double time_weight; + double resource_weight; + double energy_weight; unsigned int period_for_time; unsigned int period_for_resource; unsigned int period_for_power; diff --git a/allscale/dashboard.hpp b/allscale/dashboard.hpp index 73670a2..eb77398 100644 --- a/allscale/dashboard.hpp +++ b/allscale/dashboard.hpp @@ -89,8 +89,10 @@ namespace allscale { namespace dashboard // current power usage / max power usage \in [0..1] float power = 0; - + std::string to_json() const; + + float last_local_score; template void serialize(Archive& ar, unsigned); diff --git a/allscale/optimizer.hpp b/allscale/optimizer.hpp index fc64428..a255497 100644 --- a/allscale/optimizer.hpp +++ b/allscale/optimizer.hpp @@ -11,6 +11,8 @@ #include #include +#include + #include #include @@ -23,6 +25,7 @@ namespace allscale { float avg_time_; unsigned long long energy_; std::uint64_t active_frequency_; + std::size_t active_cores_per_node_; std::size_t cores_per_node_; template @@ -33,6 +36,7 @@ namespace allscale { ar & avg_time_; ar & energy_; ar & active_frequency_; + ar & active_cores_per_node_; ar & cores_per_node_; } }; @@ -87,16 +91,35 @@ namespace allscale { , f_resource_max(other.f_resource_max) , f_resource_leeway(other.f_resource_leeway) , o_ino(std::move(o_ino)) - {} + // VV: Used by balance_ino_nmd + , nmd_initialized(other.nmd_initialized) + , nmd(other.nmd) + , nodes_min(other.nodes_min) + , nodes_max(other.nodes_max) + , threads_min(other.threads_min) + , threads_max(other.threads_max) + , previous_num_nodes(other.previous_num_nodes) + , use_lopt(other.use_lopt) + , last_optimization_score(other.last_optimization_score) + { + objectives_scale[0] = other.objectives_scale[0]; + objectives_scale[1] = other.objectives_scale[1]; + objectives_scale[2] = other.objectives_scale[2]; + } bool active() const { return active_; } + double get_optimization_score(); + hpx::future balance(bool); hpx::future balance_ino(const std::vector &old_mapping); + hpx::future balance_ino_nmd(const std::vector &old_mapping); hpx::future decide_random_mapping(const std::vector &old_mapping); + + void signal_objective_changed(); bool may_rebalance(); @@ -104,7 +127,7 @@ namespace allscale { std::size_t u_steps_till_rebalance; void tune(std::vector const& state); - + int nmd_initialized; std::vector active_nodes_; std::uint64_t active_frequency_; @@ -118,9 +141,17 @@ namespace allscale { std::vector localities_; + // VV: balance_ino and balance_global data float f_resource_max, f_resource_leeway; + std::size_t previous_num_nodes; + int nodes_min, nodes_max, threads_min, threads_max; components::internode_optimizer_t o_ino; + + components::NelderMead nmd; + double last_optimization_score; + double objectives_scale[3]; + bool use_lopt; }; } diff --git a/allscale/scheduler.hpp b/allscale/scheduler.hpp index 8cf6006..f448ad5 100644 --- a/allscale/scheduler.hpp +++ b/allscale/scheduler.hpp @@ -48,6 +48,7 @@ namespace allscale static HPX_EXPORT void update_policy(task_times const& times, std::vector mask, std::uint64_t frequency); static void apply_new_mapping(const std::vector &new_mapping); + static void update_max_threads(std::size_t max_threads); static HPX_EXPORT void schedule(work_item&& work); static HPX_EXPORT components::scheduler* run(std::size_t rank); diff --git a/allscale/tuner.hpp b/allscale/tuner.hpp index da28253..f1285a8 100644 --- a/allscale/tuner.hpp +++ b/allscale/tuner.hpp @@ -3,6 +3,7 @@ #define ALLSCALE_TUNER_HPP #include +#include #include #include @@ -74,6 +75,24 @@ namespace allscale { void next_direction(); }; + + struct nmd_optimizer : tuner + { + nmd_optimizer(std::size_t nodes_min, std::size_t nodes_max); + components::NmdGeneric nmd; + std::vector avail_freqs; + std::vector best; + bool converged; + bool initialized; + // VV: even though NmdGeneric supports arbitrary number of optimization parameters + // we're applying it to number of nodes and CPU frequency, it is trivial to + // add number of threads + std::size_t constraint_min[2], constraint_max[2]; + + tuner_configuration next(tuner_configuration const& current_cfg, tuner_state const& current_state, tuning_objective) override; + + double previous_weights[3]; + }; } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 25cf7c9..1481fbf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,6 +25,7 @@ set(_srcs components/util/graph_colouring.cpp components/localoptimizer.cpp components/nmsimplex_bbincr.cpp + components/nmd.cpp ) if(CPUFREQ_FOUND) diff --git a/src/components/localoptimizer.cpp b/src/components/localoptimizer.cpp index 86faa91..da51330 100644 --- a/src/components/localoptimizer.cpp +++ b/src/components/localoptimizer.cpp @@ -15,432 +15,397 @@ //#define DEBUG_MULTIOBJECTIVE_ 1 //#define DEBUG_CONVERGENCE_ 1 //#define MEASURE_MANUAL 1 // define to generate output consumed by the regression test -#define MEASURE_ 1 +//#define MEASURE_ 1 // only meant to be defined if one needs to measure the efficacy // of the scheduler //#define ALLSCALE_HAVE_CPUFREQ 1 -#define ALLSCALE_USE_CORE_OFFLINING 1 - -namespace allscale { -namespace components { - -localoptimizer::localoptimizer(std::list targetobjectives) - : objectives_((int)targetobjectives.size()), - nmd(0.01), - param_changes_(0), - steps_(0), - current_param_(thread), - converged_(false) - { - for (objective o : targetobjectives) { - //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n'; - objectives_[o.priority] = o; - objectives_[o.priority].localmin=10000; - objectives_[o.priority].globalmin=10000; - objectives_[o.priority].localmax=0.0; - objectives_[o.priority].globalmax=0.0; - objectives_[o.priority].converged=false; - objectives_[o.priority].initialized=false; - objectives_[o.priority].min_params_idx=0; - objectives_[o.priority].converged_minimum=0; - } -#ifdef ALLSCALE_HAVE_CPUFREQ - setCurrentFrequencyIdx(0); -#endif -}; -void localoptimizer::setobjectives(std::list targetobjectives){ - objectives_.clear(); - objectives_.resize((int)targetobjectives.size()); - for (objective o : targetobjectives) { - //std::cout << o.type << "," << o.leeway << "," << o.priority << '\n'; - objectives_[o.priority] = o; - objectives_[o.priority].localmin=10000; - objectives_[o.priority].globalmin=10000; - objectives_[o.priority].localmax=0.0; - objectives_[o.priority].globalmax=0.0; - objectives_[o.priority].converged=false; - objectives_[o.priority].initialized=false; - objectives_[o.priority].min_params_idx=0; - objectives_[o.priority].converged_minimum=0; - } - steps_=0; - param_changes_=0; - current_param_=thread; -#ifdef ALLSCALE_HAVE_CPUFREQ - setCurrentFrequencyIdx(0); -#endif - converged_=false; -} +namespace allscale +{ +namespace components +{ -void localoptimizer::reset(int threads, int freq_idx){ - threads_param_ = threads; - param_changes_=0; - thread_param_values_.clear(); -#ifdef ALLSCALE_HAVE_CPUFREQ - frequency_param_= freq_idx; - frequency_param_values_.clear(); -#endif - current_objective_idx_=0; - steps_=0; - current_param_=thread; - converged_=false; -}; +localoptimizer::localoptimizer() + + : pending_threads(0.), + pending_energy(0.), + pending_time(0.), + pending_num_times(0.), + mo_initialized(false), + frequency_param_(0), + converged_(false), + convergence_threshold_(0.005), + time_weight(0.0), + energy_weight(0.0), + resource_weight(0.0), + nmd(0.005) + { + if (optmethod_ == random) + srand(std::time(NULL)); + + // VV: Start with 500ms as the guestimation of max iteration time + objectives_scale[0] = 0.5; + objectives_scale[1] = 1.0; + objectives_scale[2] = 1.0; + + nmd.set_scale(objectives_scale); + } + +double localoptimizer::evaluate_score(const double objectives[]) +{ + if ( mo_initialized ) { + return nmd.evaluate_score(objectives, nullptr); + } -#ifdef DEBUG_ -void localoptimizer::printobjectives(){ - for(auto& el: objectives_){ - std::cout << "Objective" << "\t\t" << "Priority" << "\t\t" << "Leeway" << - std::endl; - switch (el.type){ - case time: - std::cout << "Time" << "\t\t" << el.priority << "\t\t" << el.leeway << - std::endl; - break; - case energy: - std::cout << "Energy" << "\t\t" << el.priority << "\t\t" << el.leeway << - std::endl; - break; - case resource: - std::cout << "Resource" << "\t\t" << el.priority << "\t\t" << el.leeway << - std::endl; - break; - } - } + return -1.0; } - -void localoptimizer::printverbosesteps(actuation act){ - std::cout << "[INFO]"; - if (optmethod_==random) - std::cout << "Random "; - else if (optmethod_==allscale){ - std::cout << "Allscale "; - } - std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_; -#ifdef ALLSCALE_HAVE_CPUFREQ - std::cout << ", CPU Frequency to " << frequencies_param_allowed_[act.frequency_idx] - << std::endl; -#else - std::cout << std::endl; -#endif - +void localoptimizer::setobjectives(double time_weight, + double energy_weight, + double resource_weight) +{ + this->time_weight = time_weight; + this->energy_weight = energy_weight; + this->resource_weight = resource_weight; + + // VV: Modifying the objectives triggers restarting the optimizer + // from scratch + + mo_initialized = false; + converged_ = false; } -#endif +void localoptimizer::reset(int threads, int freq_idx) +{ + threads_param_ = threads; + thread_param_values_.clear(); -void localoptimizer::measureObjective(double iter_time, double power, double threads){ - for(auto& el: objectives_){ - switch (el.type){ - case time: - el.samples.insert(el.samples.begin(),iter_time); - if (el.samples.size()>1000) - el.samples.resize(500); - - el.threads_samples.insert(el.threads_samples.begin(),threads); - if (el.threads_samples.size()>1000) - el.threads_samples.resize(500); - -#ifdef ALLSCALE_HAVE_CPUFREQ - el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx()); - if (el.freq_samples.size()>1000) - el.freq_samples.resize(500); -#endif + frequency_param_ = freq_idx; + converged_ = false; +}; - if (el.globalmin > iter_time){ - el.globalmin = iter_time; - el.min_params_idx=param_changes_; - } - if (el.globalmax < iter_time) - el.globalmax = iter_time; -#ifdef DEBUG__ - std::cout << "Iteration Time Minimum: " << el.globalmin << std::endl; - std::cout << "Iteration Time Maximum: " << el.globalmax << std::endl; - std::cout << "Iteration Time Samples: "; - for(auto& samp: el.samples) - std::cout << samp << ","; - std::cout << std::endl; -#endif - break; - case energy: - el.samples.insert(el.samples.begin(),power); - if (el.samples.size()>1000) - el.samples.resize(500); - - el.threads_samples.insert(el.threads_samples.begin(),threads); - if (el.threads_samples.size()>1000) - el.threads_samples.resize(500); - -#ifdef ALLSCALE_HAVE_CPUFREQ - el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx()); - if (el.freq_samples.size()>1000) - el.freq_samples.resize(500); +#ifdef DEBUG_ +void localoptimizer::printobjectives() +{ + std::cout << "[LocalOptimizer|DEBUG] Weights=[time:" << time_weight + << ", energy:" << energy_weight + << ", resource:" << resource_weight << "]" << std::endl << std::flush; +} #endif - if (el.globalmin > power){ - el.globalmin = power; - el.min_params_idx=param_changes_; - } - if (el.globalmax < power) - el.globalmax = power; -#ifdef DEBUG__ - std::cout << "Power Consumption Minimum: " << el.globalmin << std::endl; - std::cout << "Power Consumption Maximum: " << el.globalmax << std::endl; - std::cout << "Power Consumption Samples: "; - for(auto& samp: el.samples) - std::cout << samp << ","; - std::cout << std::endl; -#endif - break; - case resource: - el.samples.insert(el.samples.begin(),threads); - if (el.samples.size()>1000) - el.samples.resize(500); - - el.threads_samples.insert(el.threads_samples.begin(),threads); - if (el.threads_samples.size()>1000) - el.threads_samples.resize(500); - -#ifdef ALLSCALE_HAVE_CPUFREQ - el.freq_samples.insert(el.freq_samples.begin(),getCurrentFrequencyIdx()); - if (el.freq_samples.size()>1000) - el.freq_samples.resize(500); +bool localoptimizer::isConverged() +{ + #if 0 + // VV: This is an attempt to make optimization choices for + // tasks of smaller granularity (after splitting a task) + if ( converged_ == false ) { + return false; + } + + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + + if ( reexplore_every_ms >0 && timestamp_now - last_convergence_ts > reexplore_every_ms ) + { + std::cout << "[LOCALOPTIMIZER] Re-exploring space!" << std::endl; + initialize_nmd(); + } + #endif + return converged_; +} +#ifdef DEBUG_ +void localoptimizer::printverbosesteps(actuation act) +{ + static int last_frequency_idx = 0; + + std::cout << "[INFO]"; + if (optmethod_ == random) + std::cout << "Random "; + else if (optmethod_ == allscale) + { + std::cout << "Allscale "; + } + std::cout << "Scheduler Step: Setting OS Threads to " << threads_param_; + + if (act.frequency_idx >= 0) + last_frequency_idx = act.frequency_idx; + std::cout << " , CPU Frequency to " << frequencies_param_allowed_[last_frequency_idx] + << std::endl; +} #endif - if (el.globalmin > threads){ - el.globalmin = threads; - el.min_params_idx=param_changes_; - } - if (el.globalmax < threads) - el.globalmax = threads; -#ifdef DEBUG__ - std::cout << "Threads Minimum: " << el.globalmin << std::endl; - std::cout << "Threads Maximum: " << el.globalmax << std::endl; - std::cout << "Threads Samples: "; - for(auto& samp: el.samples) - std::cout << samp << ","; - std::cout << std::endl; -#endif - break; - } - } +void localoptimizer::accumulate_objective_measurements() +{ + if (pending_num_times) + { + pending_time /= (double)pending_num_times; + pending_threads /= (double)(pending_num_times*threads_dt); + pending_energy /= (double)pending_num_times; + pending_num_times = 0; + } } -actuation localoptimizer::step() +void localoptimizer::setmaxthreads(std::size_t threads) { - steps_++; - actuation act; - act.delta_threads=threads_param_; -#ifdef ALLSCALE_HAVE_CPUFREQ - act.frequency_idx=frequency_param_; -#endif - /* random optimization step */ - if (optmethod_ == random) - { - act.delta_threads = (rand() % max_threads_) - threads_param_; -#ifdef ALLSCALE_HAVE_CPUFREQ - act.frequency_idx = rand() % frequencies_param_allowed_.size(); - if (act.frequency_idx == frequency_param_) - act.frequency_idx = -1; -#endif - } - - else if (optmethod_ == allscale) - { - if (current_objective_idx_ > objectives_.size()) - return act; + max_threads_=threads; + threads_param_=threads; + + #if 0 + double threads_tick = threads / 5.; + + if ( threads_tick < 1.0 ) + threads_tick = 1.0; + + threads_dt = (int) round(threads_tick); + #elif 0 + if ( max_threads_ <= 4 ) + threads_dt = 1.; + else if ( max_threads_ <= 8 ) + threads_dt = 2.; + else if ( max_threads_ <= 32 ) + threads_dt = 4.; + else + threads_dt = 8.; + #else + threads_dt = 1.; + #endif + + if ( mo_initialized ) { + if ( converged_ == false ) { + initialize_nmd(true); + } else { + double factor; + int min_freq = 0; + int max_freq = frequencies_param_allowed_.size() - 1; + + if ( time_weight >= energy_weight + resource_weight) { + factor = 0.5; + min_freq = frequencies_param_allowed_.size() / 4; + } + else { + factor = 0.25; + max_freq = max_freq / 2; + } + + int min_threads = factor * max_threads_/((double)threads_dt); + + if ( min_threads < 1 ) + min_threads = 1; + + double constraint_min[] = {(double) min_threads, (double) min_freq}; + #if defined(ALLSCALE_HAVE_CPUFREQ) + double constraint_max[] = {ceil(max_threads_/(double)threads_dt), + (double)max_freq}; + #else + std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl; + double constraint_max[] = {ceil(max_threads_/(double)threads_dt), + 0.0}; + #endif + + nmd.update_constraints(constraint_min, constraint_max); + } + } +} - if (steps_ < warmup_steps_) - { +void localoptimizer::initialize_nmd(bool from_scratch) +{ + // VV: Place constraints to #threads and cpu_freq tunable knobs + double factor; + int min_freq = 0; + int max_freq = frequencies_param_allowed_.size() - 1; + + if ( time_weight >= energy_weight + resource_weight) { + factor = 0.5; + min_freq = frequencies_param_allowed_.size() / 4; + } + else { + factor = 0.25; + max_freq = max_freq / 2; + } + + int min_threads = factor * max_threads_/((double)threads_dt); + + if ( min_threads < 1 ) + min_threads = 1; + int max_threads = max_threads_; + + double constraint_min[] = { (double) min_threads, (double) min_freq}; + #if defined(ALLSCALE_HAVE_CPUFREQ) + double constraint_max[] = {ceil(max_threads_/(double)threads_dt), + (double)max_freq}; + #else + std::cout << "Allowed frequencies: " << frequencies_param_allowed_.size() << std::endl; + double constraint_max[] = {ceil(max_threads_/(double)threads_dt), + 0.0}; + #endif + const double opt_weights[] = { time_weight, energy_weight, resource_weight }; + + nmd.set_scale(objectives_scale); + + if( from_scratch == false ){ + double prev_simplex[NMD_NUM_KNOBS+1][NMD_NUM_KNOBS]; + + nmd.get_simplex(prev_simplex); + + nmd.initialize_simplex(opt_weights, + prev_simplex, + constraint_min, + constraint_max); + } else { + if ( time_weight >= energy_weight + resource_weight ) { + double initial_simplex[3][2] = { + {(double) min_threads, constraint_min[1]}, + {max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0}, + {(min_threads+max_threads)/2., constraint_max[1]} + }; + nmd.initialize_simplex(opt_weights, + initial_simplex, + constraint_min, + constraint_max); + } else { + double initial_simplex[3][2] = { + {(double) min_threads, constraint_min[1]}, + {max_threads/2.0, (constraint_min[1]+constraint_max[1])/2.0}, + {(min_threads+max_threads)/2., constraint_max[1]} + }; + + nmd.initialize_simplex(opt_weights, + initial_simplex, + constraint_min, + constraint_max); + } + } + + mo_initialized = true; + explore_knob_domain = true; + converged_ = false; +} -#ifdef DEBUG_MULTIOBJECTIVE_ - std::cout << "[LOCALOPTIMIZER|INFO] Optimizer No-OP: either at warm-up or optimizer has completed\n"; -#endif - // set some random parametrization to collect at least 3 different - // vertices to be used as input to the optimizer - act.delta_threads = rand() % max_threads_; -#ifdef ALLSCALE_HAVE_CPUFREQ - act.frequency_idx = rand() % frequencies_param_allowed_.size(); -#endif - return act; - } +void localoptimizer::set_objectives_scale(const double objectives_scale[3]) +{ + for (auto i=0ul; iobjectives_scale[i] = objectives_scale[i]; + + nmd.set_scale(objectives_scale); +} - // iterate over all objectives in decreasing priority - objective obj = objectives_[current_objective_idx_]; +void localoptimizer::measureObjective(double iter_time, double power, double threads) +{ + // VV: iter_time has no bound, threads has bound @max_threads_ + // and power 1.0 + + std::cout << "Measuring objective: " + << iter_time << " " + << power << " " + << threads << std::endl; + if ( objectives_scale[0] < iter_time ) { + objectives_scale[0] = iter_time * 1.1; + set_objectives_scale(objectives_scale); + } + + pending_time += iter_time; + pending_energy += power; + pending_threads += threads / max_threads_; + pending_num_times++; +} - // initialize optimizer for this objective, if not already done so - if (!obj.initialized) - { -#ifdef DEBUG_MULTIOBJECTIVE_ - std::cout << "[LOCALOPTIMIZER|INFO] Initializing optimizer for new objective\n"; - std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush; - for (auto& sam: obj.samples) - { - std::cout << sam << "," << std::flush; - } - std::cout << "\n" << std::flush; - - std::cout << "[LOCALOPTIMIZER|DEBUG] Thread Param of Samples: " << std::flush; - for (auto& sam: obj.threads_samples) - { - std::cout << sam << "," << std::flush; - } - std::cout << "\n" << std::flush; - -#ifdef ALLSCALE_HAVE_CPUFREQ - std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush; - for (auto& sam: obj.freq_samples){ - std::cout << sam << "," << std::flush; - } - std::cout << "\n" << std::flush; -#endif -#endif - int samplenr = obj.samples.size(); -#ifdef ALLSCALE_HAVE_CPUFREQ - double params[3][2]={ - {obj.threads_samples[samplenr-1],obj.freq_samples[samplenr-1]}, - {obj.threads_samples[samplenr-2],obj.freq_samples[samplenr-2]}, - {obj.threads_samples[samplenr-3],obj.freq_samples[samplenr-3]}, - }; - double values[3]={obj.samples[samplenr-1],obj.samples[samplenr-2],obj.samples[samplenr-3]}; - - double constraint_min[]={1,0}; - double constraint_max[]={(double)max_threads_, - (double)frequencies_param_allowed_.size()}; - - nmd.initialize_simplex(params,values,constraint_min,constraint_max); - objectives_[current_objective_idx_].initialized=true; -#endif - } +void localoptimizer::reset_accumulated_measurements() +{ + pending_time = 0.; + pending_energy = 0.; + pending_threads = 0.; + pending_num_times = 0; +} -#ifdef DEBUG_MULTIOBJECTIVE_ - std::cout << "[LOCALOPTIMIZER|DEBG] Current Optimized Objective ="; - switch (obj.type) - { - case energy: - std::cout << "********** Energy\n" << std::flush; - break; - case time: - std::cout << "&&&&&&&&&& Time\n" << std::flush; - break; - case resource: - std::cout << "oooooooooo Resource\n" << std::flush; - break; - } - std::cout << "[LOCALOPTIMIZER|DEBUG] Samples: " << std::flush; - for (auto& sam: obj.samples) - { - std::cout << sam << "," << std::flush; - } - std::cout << "\n" << std::flush; - - std::cout << "[LOCALOPTIMIZER|DEBUG] Freq Param of Samples: " << std::flush; -#ifdef ALLSCALE_HAVE_CPUFREQ - for (auto& sam: obj.freq_samples) - { - std::cout << sam << "," << std::flush; - } - std::cout << "\n" << std::flush; -#endif -#endif +actuation localoptimizer::step(std::size_t active_threads) +{ + actuation act; + // VV: Possibly amend erroneous information + threads_param_ = active_threads; + act.threads = threads_param_; + + act.frequency_idx = frequency_param_; + + /* random optimization step */ + if (optmethod_ == random) + { + act.threads = (rand() % max_threads_); + act.frequency_idx = rand() % frequencies_param_allowed_.size(); + } + else if (optmethod_ == allscale) + { + // VV: Keep track of dirty objectives + if (mo_initialized == false) + initialize_nmd(true); + + accumulate_objective_measurements(); + const double latest_measurements[] = {pending_time, + pending_energy, + pending_threads}; + reset_accumulated_measurements(); + + if ( converged_ == false ){ + optstepresult nmd_res = nmd.step(latest_measurements, + active_threads, + frequency_param_); - optstepresult nmd_res = nmd.step(obj.samples[0]); #ifdef DEBUG_MULTIOBJECTIVE_ - std::cout << "[LOCALOPTIMIZER|DEBUG] Calling NMD Optimizer Step, Param = \n"; - std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try: "; - std::cout << "Threads = " << nmd_res.threads; -#ifdef ALLSCALE_HAVE_CPUFREQ - std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl; -#endif - std::cout << "Converg Thresh = " << convergence_threshold_ << std::endl; -#endif - if (nmd_res.converged) - { - objectives_[current_objective_idx_].converged = true; - objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective(); - double* minimization_point = nmd.getMinVertices(); - objectives_[current_objective_idx_].minimization_params[0]= - minimization_point[0]; - objectives_[current_objective_idx_].minimization_params[1]= - minimization_point[1]; -#ifdef DEBUG_CONVERGENCE_ - std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n"; - std::cout << "******************************************" << std::endl; - std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << - objectives_[current_objective_idx_].converged_minimum << - "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] << - std::endl; - std::cout << "******************************************" << std::endl; -#endif - act.delta_threads=minimization_point[0]; -#ifdef ALLSCALE_HAVE_CPUFREQ - act.frequency_idx=minimization_point[1]; + std::cout << "[LOCALOPTIMIZER|DEBUG] New Vertex to try:"; + std::cout << " Threads = " << nmd_res.threads; + std::cout << " Freq Idx = " << nmd_res.freq_idx << std::endl; + std::cout << " Converge Thresh = " << convergence_threshold_ << std::endl; #endif - current_objective_idx_++; - if (current_objective_idx_ == objectives_.size()) - { - converged_=true; -#ifdef DEBUG_CONVERGENCE_ - std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl; -#endif - } - } - else - { - // if a higher priority objective starts getting off leeway margin, - // decide convergence of the current param at this parameter point - if (current_objective_idx_>0) - for (int i=0;i max_leeway_value && - priority_obj.samples[1] > max_leeway_value) - { - objectives_[current_objective_idx_].converged = true; - objectives_[current_objective_idx_].converged_minimum = nmd.getMinObjective(); - double* minimization_point = nmd.getMinVertices(); - objectives_[current_objective_idx_].minimization_params[0]= - minimization_point[0]; - objectives_[current_objective_idx_].minimization_params[1]= - minimization_point[1]; + if (nmd_res.converged) + { + double min_score = nmd.getMinObjective(); + double *minimization_point = nmd.getMinVertices(); #ifdef DEBUG_CONVERGENCE_ - std::cout << "[LOCALOPTIMIZER|INFO] Leeway convergence\n"; - std::cout << "******************************************" << std::endl; - std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << - objectives_[current_objective_idx_].converged_minimum << - "Threads = " << minimization_point[0] << "Freq_idx = " << minimization_point[1] << - std::endl; - std::cout << "******************************************" << std::endl; -#endif - // find the parameter point that scores the leeway margin value - act.delta_threads = (int)priority_obj.minimization_params[0]* - (max_leeway_value/priority_obj.converged_minimum); -#ifdef ALLSCALE_HAVE_CPUFREQ - act.frequency_idx = (int)priority_obj.minimization_params[1]* - (max_leeway_value/priority_obj.converged_minimum); -#endif - //act.delta_threads=minimization_point[0]; - //act.frequency_idx=minimization_point[1]; - current_objective_idx_++; - if (current_objective_idx_ == objectives_.size()) - { - converged_=true; -#ifdef DEBUG_CONVERGENCE_ - std::cout << "[LOCALOPTIMIZER|INFO] ALL OBJECTIVES HAVE CONVERGED " << std::endl; + std::cout << "[LOCALOPTIMIZER|INFO] NMD convergence\n"; + std::cout << "******************************************" << std::endl; + std::cout << "[LOCALOPTIMIZER|INFO] Minimal Objective Value = " << min_score << " Threads = " << minimization_point[0] << " Freq_idx = " << minimization_point[1] << std::endl; + std::cout << "******************************************" << std::endl; #endif - } - return act; - } - } - act.delta_threads=(nmd_res.threads==0)?getCurrentThreads():nmd_res.threads; -#ifdef ALLSCALE_HAVE_CPUFREQ - act.frequency_idx=nmd_res.freq_idx; + act.threads = minimization_point[0]; + act.frequency_idx = minimization_point[1]; + + // VV: Stop searching for new knob_set + converged_ = true; + } else { + // VV: Have not converged yet, keep exploring + act.threads = nmd_res.threads; + act.frequency_idx = nmd_res.freq_idx; + } + + act.threads *= threads_dt; + + threads_param_ = act.threads; +#ifdef DEBUG_MULTIOBJECTIVE_ + std::cout << "[LOCALOPTIMIZER|DEBUG] ACTUAL Vertex to try:"; + std::cout << " Threads = " << act.threads; + std::cout << " Freq Idx = " << act.frequency_idx << std::endl; #endif - } - } - return act; -} -} + } + } +validate_act: + + if (act.threads > max_threads_) + { + act.threads = max_threads_; + } + else if (act.threads < 1) + { + act.threads = getCurrentThreads(); + } + + // VV: If freq_idx is -1 then set it to last used frequency (frequency_param_) + if (act.frequency_idx < 0) + act.frequency_idx = frequency_param_; + else if (act.frequency_idx > frequencies_param_allowed_.size() - 1) + act.frequency_idx = frequencies_param_allowed_.size() - 1; + + threads_param_ = act.threads; + frequency_param_ = act.frequency_idx; + + return act; } +} // namespace components +} // namespace allscale diff --git a/src/components/monitor_component.cpp b/src/components/monitor_component.cpp index 3b6e4b5..5ae6463 100644 --- a/src/components/monitor_component.cpp +++ b/src/components/monitor_component.cpp @@ -26,6 +26,11 @@ #include +#ifdef ALLSCALE_HAVE_CPUFREQ +#define POWER_MEASUREMENT_PERIOD_MS 100 +#include +#endif + #ifdef HAVE_PAPI #include #include @@ -353,16 +358,55 @@ namespace allscale { namespace components { float monitor::get_current_power() { + #ifdef ALLSCALE_HAVE_CPUFREQ + /*VV: Read potentially multiple measurements of power within the span of + POWER_MEASUREMENT_PERIOD_MS milliseconds. Each time this function + is invoked it returns the running average of power.*/ + static mutex_type power_mtx; + static unsigned long long times_read_power=0; + static unsigned long long power_sum = 0ull; + static long timestamp_reset_power = 0; + + int64_t t_now, dt; + float ret; + + std::lock_guard lock(power_mtx); + + t_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + + dt = t_now - timestamp_reset_power; + times_read_power ++; + + power_sum += util::hardware_reconf::read_system_power(); + + ret = power_sum / (float)(times_read_power); + + if ( dt >= POWER_MEASUREMENT_PERIOD_MS ) { + times_read_power = 0; + power_sum = 0ull; + timestamp_reset_power = t_now; + } + + return ret; + #else return allscale::power::estimate_power(get_current_freq(0)) * num_cpus_; + #endif } float monitor::get_max_power() { -#ifdef POWER_ESTIMATE +#if defined(ALLSCALE_HAVE_CPUFREQ) + // VV: report 1100 Watts + // ( redbox paper 5283 for 8335-GTA indicates 1875 for the + // whole node but I've noticed up to ~1100 Watts, for + // the time being this is a good enough figure ) + // ( this should be dynamically configured/discovered ) + return 1100.0; +#elif defined(POWER_ESTIMATE) return allscale::power::estimate_power(get_max_freq(0)) * num_cpus_; #else - return 0.0; + return 125.0; #endif } diff --git a/src/components/nmd.cpp b/src/components/nmd.cpp new file mode 100644 index 0000000..bb59b1a --- /dev/null +++ b/src/components/nmd.cpp @@ -0,0 +1,873 @@ +#include +#include +#include +#include +#include +#include +#include + + +//#define NMD_DEBUG_ +//#define NMD_INFO_ + +#ifdef NMD_DEBUG_ +#define OUT_DEBUG(X) X +#ifndef NMD_INFO_ + #define NMD_INFO_ +#endif +#else +#define OUT_DEBUG(X) {} +#endif + +#if defined(NMD_INFO_) +#define OUT_INFO(X) X +#else +#define OUT_INFO(X) {} +#endif + + +using namespace allscale::components; + +NmdGeneric::NmdGeneric() +: +current_state(warmup), warmup_step(0), +conv_threshold(0), num_knobs(0), num_objectives(0), +scores(nullptr), simplex(nullptr), initial_config(nullptr), +constraint_max(nullptr), constraint_min(nullptr), +point_reflect(nullptr), point_contract(nullptr), weights(nullptr) +{} + +NmdGeneric::NmdGeneric(std::size_t num_knobs, + std::size_t num_objectives, + double conv_threshold, + int64_t cache_expire_dt_ms, + std::size_t max_iters) +: conv_threshold(conv_threshold), num_knobs(num_knobs), +num_objectives(num_objectives), +cache_expire_dt_ms(cache_expire_dt_ms), +final_explore(false), +max_iters(max_iters) +{ + scores = new double [num_knobs+1]; + centroid = new std::size_t [num_knobs]; + simplex = new std::size_t* [num_knobs+1]; + initial_config = new std::size_t* [num_knobs+1]; + + for (auto i=0ul; iweights[i] = weights[i]; + + this->score_function = score_function; + + set_constraints_now(constraint_min, constraint_max); + + iteration = 0; + if ( initial_config == nullptr ) { + std::set > fake; + + OUT_INFO( + std::cout << "[NMD|Info] Generating initial config for " << num_knobs << std::endl; + ) + + for (auto i=0ul; iinitial_config[i][j] = std::rand() % width + constraint_min[j]; + } + + generate_unique(this->initial_config[i], false, &fake); + auto new_key = std::vector(); + new_key.assign(this->initial_config[i], this->initial_config[i]+num_knobs); + fake.insert(new_key); + } + } else { + for (auto i=0ul; iinitial_config[i][j] = initial_config[i][j]; + } + + current_state = warmup; + warmup_step = 0; + + OUT_INFO( + for (auto i=0ul; iinitial_config[i][j] << " "; + std::cout << std::endl; + } + ) + + final_explore = false; + times_reentered_start = 0; +} + +void NmdGeneric::set_constraints_now(const std::size_t constraint_min[], + const std::size_t constraint_max[]) +{ + for (auto i=0ul; iconstraint_max[i] = constraint_max[i]; + this->constraint_min[i] = constraint_min[i]; + } +} + +void NmdGeneric::generate_unique(std::size_t initial[], bool accept_stale=false, + const std::set > *extra=nullptr) const +{ + const auto ts_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + + auto explored = (std::size_t) std::count_if(cache.begin(), cache.end(), [ts_now, accept_stale](const auto &entry) { + auto dt = ts_now - entry.second.cache_ts; + return accept_stale || dt < entry.second.cache_dt; + }); + + auto max_comb = compute_max_combinations(); + + if ( max_comb > explored && max_comb - explored > 1 ) { + // VV: TODO Optimize check_novel(). Currently, large "max_distance" values + // may result in extreme overheads + const auto max_distance = 3ul; + int64_t temp[num_knobs]; + std::set< std::vector > candidates; + + auto check_novel = [this, &ts_now, &candidates, &accept_stale, &extra](int64_t knobs[]) mutable -> void { + apply_constraint(knobs); + + auto key = std::vector(); + + key.assign(knobs, knobs+num_knobs); + auto entry = cache.find(key); + if ( extra == nullptr || extra->find(key) == extra->end()) { + if ( entry == cache.end() ) { + candidates.insert(key); + } else { + std::cout << "Found "; + for (auto i=0ul; isecond.cache_ts; + if (accept_stale==false || + (dt >= entry->second.cache_dt && cache_expire_dt_ms > 0) ) { + candidates.insert(key); + } + } + } + }; + + auto counters = std::vector(num_knobs, 0ul); + + bool done = false; + + while ( done == false ) { + // VV: Generate all possible permutations + auto ops = std::string(num_knobs, '0'); + do{ + for ( auto j=0ul; j constraint_max[i] - constraint_min[i] +1) || + (counters[i] > max_distance) ) { + counters[i] = 0; + counters[i+1] += 1; + } + } + + if ( (counters[num_knobs-1] > + constraint_max[num_knobs-1] - constraint_min[num_knobs-1] +1) + || (counters[num_knobs-1] > max_distance)) + done = true; + } + + // std::cout << "Step " << candidates.size() << std::endl; + + std::vector< std::vector > sorted; + + sorted.assign(candidates.begin(), candidates.end()); + candidates.clear(); + + std::sort(sorted.begin(), sorted.end(), + [initial](const auto &e1, const auto &e2) mutable -> int { + int64_t t; + std::size_t d1=0ul, d2=0ul; + + for (auto i=0ul; i(); + const auto ts_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + + for ( auto i = 0ul; isecond; + p.cache_ts = ts_now; + p.cache_dt = cache_expire_dt_ms; + entry->second = p; + } + + OUT_DEBUG( + std::cout << "CACHE ENTRIES: "<(); + key.assign(point_reflect, point_reflect + num_knobs); + + auto entry = cache.find(key); + + current_state = reflect; + + if ( entry != cache.end() + && times_reentered_start++ < 5 + && iteration < max_iters ) { + auto ts_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + + if ( ts_now - entry->second.cache_ts < entry->second.cache_dt ) { + return do_reflect(entry->second.objectives.data(), entry->second.knobs.data()); + } + } + + return key; +} + +std::vector NmdGeneric::do_shrink() +{ + OUT_DEBUG( + std::cout << "[NMD|Dbg] INNER shrink" << std::endl; + ) + + std::set > fake; + std::vector key; + + for ( auto i=0ul; iinitial_config[i][j] << " "; + std::cout << std::endl; + } + ) + + return do_warmup({}, {}); +} + +std::vector NmdGeneric::do_contract_out(const double measurements[], + const std::size_t observed_knobs[]) +{ + ensure_profile_consistency(point_contract, observed_knobs); + score_contract = score(measurements); + OUT_DEBUG( + std::cout << "[NMD|Dbg] INNER ContractOUT: "; + for (auto i=0ul; i(std::chrono::system_clock::now()).time_since_epoch().count(); + + logistics entry; + entry.knobs.assign(observed_knobs, observed_knobs+num_knobs); + entry.objectives.assign(measurements, measurements+num_objectives); + entry.cache_dt = cache_expire_dt_ms; + entry.cache_ts = ts_now; + + cache[entry.knobs] = entry; + + if ( score_contract <= score_reflect ){ + // VV: foc <= fr then replace v[n] with voc + + for (auto i=0ul; i NmdGeneric::do_contract_in(const double measurements[], + const std::size_t observed_knobs[]) +{ + ensure_profile_consistency(point_contract, observed_knobs); + score_contract = score(measurements); + + OUT_DEBUG( + std::cout << "[NMD|Dbg] INNER ContractIN: "; + for (auto i=0ul; i(std::chrono::system_clock::now()).time_since_epoch().count(); + + logistics entry; + entry.knobs.assign(observed_knobs, observed_knobs+num_knobs); + entry.objectives.assign(measurements, measurements+num_objectives); + entry.cache_dt = cache_expire_dt_ms; + entry.cache_ts = ts_now; + + cache[entry.knobs] = entry; + + if ( score_contract < scores[num_knobs] ){ + // VV: fic < f[n] then replace v[n] with vic + + for (auto i=0ul; i NmdGeneric::do_expand(const double measurements[], + const std::size_t observed_knobs[]) +{ + ensure_profile_consistency(point_expand, observed_knobs); + score_expand = score(measurements); + + OUT_DEBUG( + std::cout << "[NMD|Dbg] INNER Expand: "; + for (auto i=0ul; i(std::chrono::system_clock::now()).time_since_epoch().count(); + + logistics entry; + entry.knobs.assign(observed_knobs, observed_knobs+num_knobs); + entry.objectives.assign(measurements, measurements+num_objectives); + entry.cache_dt = cache_expire_dt_ms; + entry.cache_ts = ts_now; + + cache[entry.knobs] = entry; + + if ( score_expand < score_reflect ){ + // VV: fe < fr then replace v[n] with ve + for (auto i=0ul; i NmdGeneric::do_reflect(const double measurements[], + const std::size_t observed_knobs[]) +{ + ensure_profile_consistency(point_reflect, observed_knobs); + score_reflect = score(measurements); + + OUT_DEBUG( + std::cout << "[NMD|Dbg] INNER Reflect: "; + for (auto i=0ul; i(std::chrono::system_clock::now()).time_since_epoch().count(); + + logistics entry; + entry.knobs.assign(observed_knobs, observed_knobs+num_knobs); + entry.objectives.assign(measurements, measurements+num_objectives); + entry.cache_dt = cache_expire_dt_ms; + entry.cache_ts = ts_now; + + cache[entry.knobs] = entry; + + if ( score_reflect >= scores[0] && score_reflect < scores[num_knobs-1]) { + // VV: fo <= fr < f[n-1] then replace v[n] with vr and start over + for ( auto i=0ul; i(); + key.assign(point_expand, point_expand+num_knobs); + auto e = cache.find(key); + + if ( e != cache.end() ) { + if ( ts_now - e->second.cache_ts < e->second.cache_dt ) { + return do_expand(e->second.objectives.data(), + e->second.knobs.data()); + } + } + + return key; + } else if (scores[num_knobs-1] <= score_reflect + && score_reflect < scores[num_knobs]) { + // VV: Reflect lies between f[n-1] and f[n] then contract (outside) + current_state = contract_out; + double temp[num_knobs]; + + for (auto i=0ul; i(); + key.assign(point_contract, point_contract+num_knobs); + auto e = cache.find(key); + + if ( e != cache.end() ) { + if ( ts_now - e->second.cache_ts < e->second.cache_dt ) { + return do_contract_out(e->second.objectives.data(), + e->second.knobs.data()); + } + } + + return key; + } else if (score_reflect >= scores[num_knobs]) { + // VV: Reflect > f[n] then contract (inside) + current_state = contract_in; + double temp[num_knobs]; + + for (auto i=0ul; i(); + key.assign(point_contract, point_contract+num_knobs); + auto e = cache.find(key); + + if ( e != cache.end() ) { + if ( ts_now - e->second.cache_ts < e->second.cache_dt ) { + return do_contract_in(e->second.objectives.data(), + e->second.knobs.data()); + } + } + + return key; + } + + OUT_INFO( + std::cout << "[NMD|Info] Should never get here" << std::endl; + ) + + current_state = start; + return do_start(true); +} + +std::vector NmdGeneric::do_warmup(const double measurements[], + const std::size_t observed_knobs[]) +{ + std::vector ret; + OUT_DEBUG( + std::cout << "[NMD|Dbg] INNER warmup" << std::endl; + ) + + if ( warmup_step > 0 ) { + auto last = warmup_step - 1; + ensure_profile_consistency(initial_config[last], observed_knobs); + memcpy(simplex[last], initial_config[last], sizeof(std::size_t)*num_knobs); + scores[last] = score(measurements); + auto key = std::vector(); + key.assign(observed_knobs, observed_knobs+num_knobs); + + logistics entry; + + entry.cache_dt = cache_expire_dt_ms; + entry.cache_ts = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + entry.knobs.assign(observed_knobs, observed_knobs+num_knobs); + entry.objectives.assign(measurements, measurements+num_objectives); + + cache[key] = entry; + + OUT_DEBUG( + auto s = score(measurements); + std::cout << "[NMD|Dbg] Score: " << s << " for "; + for( auto i=0ul; i, bool> NmdGeneric::get_next(const double measurements[], + const std::size_t observed_knobs[]) +{ + std::vector ret; + #if defined(NMD_DEBUG_) || defined(NMD_INFO_) + const char *state_names[] = { + "warmup", + "start", + "reflect", + "expand", + "contract_in", + "contract_out", + "shrink" + }; + #endif + + OUT_DEBUG( + std::cout << "[NMD|Dbg] Current stage " << state_names[current_state] << std::endl; + ) + + switch (current_state) { + case warmup: + ret = do_warmup(measurements, observed_knobs); + break; + case start: + times_reentered_start = 0; + ret = do_start(true); + break; + case reflect: + ret = do_reflect(measurements, observed_knobs); + break; + case expand: + ret = do_expand(measurements, observed_knobs); + break; + case contract_in: + ret = do_contract_in(measurements, observed_knobs); + break; + case contract_out: + ret = do_contract_out(measurements, observed_knobs); + break; + case shrink: + ret = do_shrink(); + break; + default: + std::cout << "Unknown state!" << std::endl; + } + + OUT_INFO( + std::cout << "[NMD|Info] State " << state_names[current_state] << " proposes "; + + for (auto i=0ul; i= max_iters || sum <= conv_threshold ) { + // if ( final_explore == false ) { + // final_explore = true; + + // return false; + // } else { + // return true; + // } + OUT_INFO( + std::cout << "[NMD|Info] Converged at " << sum + << " threshold: " << conv_threshold << std::endl; + + std::cout << "[NMD|Info] Converged simplex" << std::endl; + for ( auto i=0ul; i +#include #include +#include + //#define NMD_DEBUG_ 1 -//#define NMD_INFO_ 1 -/* create the initial simplex +#ifdef NMD_DEBUG_ +#define OUT_DEBUG(X) X +#else +#define OUT_DEBUG(X) \ + { \ + } +#endif +namespace allscale +{ +namespace components +{ - vector NelderMead::explore_next_extra(double *extra, int level, + direction dir, + int level_max, int level_nested_max) { - // round to integer and bring again with allowable margins - // todo fix: generalize - if (x[0] < constraint_min[0] || x[0] > constraint_max[0]){ - x[0] = (constraint_min[0] + constraint_max[0])/2; - } - - if (x[1] < constraint_min[1] || x[1] > constraint_max[1]){ - x[1] = (constraint_min[1] + constraint_max[1])/2; - } - - x[0]=round(x[0]); - x[1]=round(x[1]); + /* + const char *to_string[] = { + "up", "up_final", "down", "left", "right", "right_final" + }; + */ + if ( extra[0] == 0.0 && extra[1] == 0.0 ) { + extra[1] = 1.0; + + return std::make_pair(level, dir); + } + switch (dir) { + case (direction::up): + if ( extra[1] < level ) { + extra[1] += 1.; + } else if( extra[0] < level_nested_max ) { + extra[0] += 1.; + dir = direction::right; + } else { + level ++; + } + break; + + case (direction::up_final): + if ( extra[1] < level ) { + extra[1] += 1.; + } else if( extra[0] < level_nested_max ) { + extra[0] += 1.; + dir = direction::right_final; + } else { + level ++; + } + break; + + + case (direction::down): + if ( extra[1] > -level ) { + extra[1] -= 1.0; + } else if ( extra[0] > -level_nested_max ){ + extra[0] -= 1.0; + dir = direction::left; + } + break; + + case (direction::left): + if ( extra[0] > -level_nested_max ) { + extra[0] -= 1.0; + } else if (extra[1] < level ) { + extra[1] += 1.0; + dir = direction::up_final; + } + break; + + case (direction::right): + if ( extra[0] < level_nested_max ) { + extra[0] += 1.; + } else if ( extra[1] <= level ) { + extra[1] -= 1.; + dir = direction::down; + } + break; + + case (direction::right_final): + if ( extra[0] < 0. ) { + extra[0] += 1.; + } else { + level ++; + extra[0] = 0.0; + extra[1] = level; + dir = direction::right; + } + break; + } + + return std::make_pair(level, dir); } -/* FIXME: generalize */ -void NelderMead::initialize_simplex(double params[][2], double values[], double constraint_min[],double constraint_max[]) +template +void NelderMead::generate_new(F &gen) { - int i,j; + double extra[] = {0, 0}; + double *new_set; + int i = 0; + int max_combinations = (constraint_max[0] - constraint_min[0]+1) + * (constraint_max[1] - constraint_min[1]+1); + int level = 1; + int max_nested_level = constraint_max[1] - constraint_min[1] +1; + int max_level = constraint_max[0] - constraint_min[0] +1; + direction dir = direction::right; + + // VV: Search for a twice as big space to take into account that + // new_set is not *actually* at 0, 0 + + max_level *= 2; + max_nested_level *=2; + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + + // VV: Restrict search-grid to a maximum block of 5x5 + int retries = 0; + const int retries_threshold = 5*5; + int is_same; + do + { + new_set = gen(extra); + + auto key = std::make_pair((int)new_set[0], (int)new_set[1]); + auto entry = cache_.find(key); + + is_same = 0; + + if ( entry != cache_.end() ) { + auto dt = timestamp_now - entry->second._cache_timestamp; + is_same = dt <= entry->second._cache_expires_dt; + } + + ++ retries; + if ( ( level < max_level +1) + && is_same + && max_combinations > (NMD_NUM_KNOBS + 1) + && retries < retries_threshold ) + { + # if 0 + extra[0] = rand() % (int)(constraint_max[0] - constraint_min[0]) + + (int)constraint_min[0] + - (int)(0.5 * (constraint_max[0] - constraint_min[0])); + + extra[1] = rand() % (int)(constraint_max[1] - constraint_min[1]) + + (int)constraint_min[1] + - (int)(0.5 * (constraint_max[1] - constraint_min[1])); + #else + auto logistics = explore_next_extra(extra, level, dir, + max_level, max_nested_level); + level = logistics.first; + dir = logistics.second; + + #endif + /* + OUT_DEBUG( + std::cout << "[NelderMead|Debug] Rejecting " + << new_set[0] << " " << new_set[1] + << " will try offset " << extra[0] << " " << extra[1] << std::endl; + ) + */ + } else { + break; + } + } while ( 1 ); + + if ( retries >= retries_threshold ) { + extra[0] = 0; + extra[1] = 0; - for (i=0;i<=n;i++) { - for (j=0;jconstraint_min[i]=constraint_min[i]; - this->constraint_max[i]=constraint_max[i]; - } - itr=0; } +void NelderMead::my_constraints(double x[]) +{ + for (auto i = 0u; i < 2u; ++i) + { + if (x[i] < constraint_min[i]) + x[i] = constraint_min[i]; + else if (x[i] > constraint_max[i]) + x[i] = constraint_max[i]; + } + + x[0] = round(x[0]); + x[1] = round(x[1]); +} -/* print out the initial values */ -void NelderMead::print_initial_simplex() +bool NelderMead::cache_update(int threads, int freq_idx, + const double objectives[], bool add_if_new) { - int i,j; - std::cout << "[NelderMead DEBUG] Initial Values\n"; - for (j=0;j<=n;j++) { - for (i=0;iscale[i] = scale[i]; + + reevaluate_scores(); +} -/* find the index of the smallest value */ -int NelderMead::vs_index() +double NelderMead::evaluate_score(const double objectives[], const double *weights) { - int j; - int vs=0; + double score; + // VV: [time, energy/power, resources] + + if (weights == nullptr) + weights = opt_weights; + + #if 0 + score = 0.0; + for (auto i = 0; i < NMD_NUM_OBJECTIVES; ++i) + { + double t = objectives[i] / scale[i]; + score += t * t * weights[i]; + } + #else + score = 1.0; + for ( auto i=0; i(std::chrono::system_clock::now()).time_since_epoch().count(); + + for (i = 0; i < NMD_NUM_KNOBS; i++) + { + this->constraint_min[i] = constraint_min[i]; + this->constraint_max[i] = constraint_max[i]; + } + + OUT_DEBUG( + std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl; + std::cout << constraint_min[0] + << ":" << constraint_max[0] << std::endl; + std::cout << constraint_min[1] + << ":" << constraint_max[1] << std::endl; + ) + + set_weights(weights); + state_ = warmup; + itr = 0; + warming_up_step = 0; + convergence_reevaluating = false; + cache_.clear(); + + for (i=0; i(std::chrono::system_clock::now()).time_since_epoch().count(); + + for (i=0; iconstraint_min[i] = constraint_min[i]; + this->constraint_max[i] = constraint_max[i]; + } + + OUT_DEBUG( + std::cout << "[NelderMead|Debug] Initialize contraints " << std::endl; + std::cout << constraint_min[0] + << ":" << constraint_max[0] << std::endl; + std::cout << constraint_min[1] + << ":" << constraint_max[1] << std::endl; + ) + + set_weights(weights); + state_ = warmup; + itr = 0; + warming_up_step = 0; + convergence_reevaluating = false; + cache_.clear(); + if (initial_simplex == nullptr) + { + #if 0 + int threads_low = round(0.25 * (constraint_max[0] - constraint_min[1]) + + constraint_min[1]); + int threads_med = round(0.5 * (constraint_max[0] - constraint_min[1]) + + constraint_min[1]); + int threads_high = constraint_max[0] * 0.75; + + initial_configurations[0][0] = threads_low; + initial_configurations[0][1] = (int)constraint_min[1]; + + initial_configurations[1][0] = threads_high; + initial_configurations[1][1] = (int)constraint_min[1]; + + initial_configurations[2][0] = threads_high; + initial_configurations[2][1] = (int)constraint_max[1]; + #else + for (i=0; i f[vh] && f[j] < f[vg]) { - vh = j; + auto e = cache_.find(std::make_pair(threads, freq_idx)); + std::cout << " Objective value = "<< std::flush << f[j] << std::flush; + + if ( e == cache_.end() ) + { + std::cout << " (not in cache)" << std::flush << std::endl; + } else { + std::cout << " OBJs: " << std::flush + << e->second.objectives[0] << " " + << e->second.objectives[1] << " " + << e->second.objectives[2] << " " + << std::endl; + } + std::cout << std::flush; } - } - return vh; } +/* print out the value at each iteration */ +void NelderMead::print_iteration() +{ + int i, j; + std::cout << "[NelderMead DEBUG] Iteration " << itr << std::endl; + //printf("Iteration %d\n",itr); + for (j = 0; j <= n; j++) + { + std::cout << "[NelderMead DEBUG] Vertex-" << j + 1 << "=("; + for (i = 0; i < n; i++) + { + //printf("%f %f\n\n",v[j][i],f[j]); + std::cout << v[j][i]; + if (i < n - 1) + std::cout << ","; + } + std::cout << ")=" << f[j] << std::endl; + } + + std::cout << "[NelderMead DEBUG] Current Objective Minimum is at: " << f[vs] << std::endl; + std::cout << "[NelderMead DEBUG] f[vs]= " << f[vs] << ", vs = " << vs << std::endl; + std::cout << "[NelderMead DEBUG] f[vh]= " << f[vh] << ", vh = " << vh << std::endl; + std::cout << "[NelderMead DEBUG] f[vg]= " << f[vg] << ", vg = " << vg << std::endl; +} /* calculate the centroid */ void NelderMead::centroid() { - int j,m; - double cent; - - for (j=0;j<=n-1;j++) { - cent=0.0; - for (m=0;m<=n;m++) { - if (m!=vg) { - cent += v[m][j]; - } - } - vm[j] = cent/n; - } + int j, m; + double cent; + + for (j = 0; j < NMD_NUM_KNOBS; j++) + { + cent = 0.0; + for (m = 0; m < NMD_NUM_KNOBS +1; m++) + { + if (m != vg) + { + cent += v[m][j]; + } + } + vm[j] = cent / n; + } + + my_constraints(vm); + + OUT_DEBUG ( + std::cout << "[NelderMead|DEBUG] New Centroid: " + << vm[0] << " " << vm[1] << std::endl; + ) } -optstepresult NelderMead::step(double param) +void NelderMead::sort_vertices() { - optstepresult res; - res.threads=0; - res.freq_idx=-1; - switch (state_){ + // VV: -1 is used for padding because the index to this map will never evaluate to 0 + int map_to_index[] = { + 0, 0, 1, 0, 2, 0, 0, 0}; - /** ITERATION START **/ - case start: - itr++; -#ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] State = Start" << std::endl; - print_initial_simplex(); -#endif - // todo: implement here the simplex initialization, currently this is - // done in the constructor - - /* find the index of the largest value (W) */ - vg = vg_index(); - - /* find the index of the smallest value (B) */ - vs = vs_index(); - - /* find the index of the second largest value (G) */ - vh = vh_index(); - - /* calculate the centroid */ - centroid(); - - /* reflect vg to new vertex vr */ - for (j=0;j<=n-1;j++) { - /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j];*/ - /* - std::cout << "vm[" << j << "]=" << vm[j] << std::endl; - std::cout << "v[vg" << j << "]=" << v[vg][j] << std::endl; - std::cout << "ALPHA=" << ALPHA << std::endl; - */ - vr[j] = vm[j]+ALPHA*(vm[j]-v[vg][j]); - } - my_constraints(vr); -#ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] Reflection Parameter = (" - << vr[0] << "," << vr[1] << ")" - << std::endl; -#endif - // enter reflection state - state_=reflection; - res.threads=vr[0]; - res.freq_idx=vr[1]; + vg = vs = vh = 0; - break; + // VV: Compute greatest, smallest, and half-point + for (i = 0; i <= n; ++i) + { + vg = f[i] > f[vg] ? i : vg; + vs = f[i] < f[vs] ? i : vs; + } - /** REFLECTION **/ + // VV: Find out what's the half-point by using a bitmap, + // when vg==vs that means that all points are equal + if (vg != vs) + { + vh = 1 + 2 + 4 - (1 << vg) - (1 << vs); + vh = map_to_index[vh]; + } + else + { + vg = 2; + vh = 1; + vs = 0; + } +} + +optstepresult NelderMead::do_step_start() +{ + optstepresult res; + times_used_cached ++; + + OUT_DEBUG( + std::cout << "[NelderMead DEBUG] State = Start" << std::endl; + print_initial_simplex(); + ) + + sort_vertices(); + + centroid(); + + // VV: Try not to pick a knob_set that already exists in `v` + auto gen_new = [this](double *extra) mutable -> double* { + + for (j = 0; j < NMD_NUM_KNOBS; j++) + vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]) - extra[j]; + + my_constraints(vr); + + return vr; + }; + + generate_new(gen_new); - /** This state is entered when we have received a sample of the objective - ** function at the reflection vertex - **/ - case reflection: -#ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl; -#endif - fr=param; - //fr = objfunc(vr); - - if (fr < f[vh]){ // f(R) < f(G) - Case (i) - if (fr >= f[vs]) { // f(R)>f(B) - for (j=0;j<=n-1;j++) { // replace W with R and end iteration - v[vg][j] = vr[j]; - } - f[vg] = fr; - updateObjectives(); - state_=start; - break; - } - - /* investigate a step further through expansion in this direction */ - else{ - for (j=0;j<=n-1;j++) { - /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j];*/ - ve[j] = vm[j]+GAMMA*(vr[j]-vm[j]); - } #ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] Expansion Parameter = (" - << ve[0] << "," << ve[1] << ")" - << std::endl; + std::cout << "[NelderMead DEBUG] Reflection Parameter = (" + << vr[0] << "," << vr[1] << ")" + << std::endl; #endif - my_constraints(ve); - // enter the state waiting for a sampled value of the objective function - // at the expansion vertex - state_=expansion; - res.threads=ve[0]; - res.freq_idx=ve[1]; - - break; - } - - }else{ // f(R) > f(G) - Case (ii) - if (fr < f[vg]) { // f(R) < f(W) - for (j=0;j<=n-1;j++) { // replace W with R - v[vg][j] = vr[j]; - } - f[vg] = fr; - } - - if (fr < f[vg] && fr >= f[vh]) { - /* perform outside contraction */ - for (j=0;j<=n-1;j++) { - /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j];*/ - vc[j] = vm[j]+BETA*(vr[j]-vm[j]); - } + // enter reflection state + state_ = reflection; + res.threads = vr[0]; + res.freq_idx = vr[1]; + + auto key = std::make_pair(res.threads, res.freq_idx); + + auto entry = cache_.find(key); + + //VV: Fixme, remove recursion due to cache + if (entry != cache_.end() && times_used_cached < 15) + { + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + auto dt = timestamp_now - entry->second._cache_timestamp; + + if (dt < entry->second._cache_expires_dt) + { + return do_step_reflect(entry->second.objectives, + entry->second.threads, + entry->second.freq_idx); + } + } + + return res; +} + +optstepresult NelderMead::do_step_reflect(const double objectives[], + double knob1, double knob2) +{ + optstepresult res; #ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] Contraction Parameter = (" - << vc[0] << "," << vc[1] << ")" - << std::endl; + std::cout << "[NelderMead DEBUG] State = Reflection" << std::endl; #endif - my_constraints(vc); - // enter the state waiting for a sampled value of the objective function - // at the outside contraction vertex - state_=contraction; - res.threads=vc[0]; - res.freq_idx=vc[1]; - break; - } else { - /* perform inside contraction */ - for (j=0;j<=n-1;j++) { - /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j];*/ - vc[j] = vm[j]-BETA*(vm[j]-v[vg][j]); - } + // VV: Make sure that we actually profiled what we meant to + double profiled[] = {knob1, knob2}; + my_constraints(profiled); + + if ( vr[0] != profiled[0] || vr[1] != profiled[1] ) { + std::cout << "[NelderMead|WARN] Meant to profile " << vr[0] << " knob1 " + "but ended up using " << profiled[0] << std::endl; + std::cout << "[NelderMead|WARN] Meant to profile " << vr[1] << " knob2 " + "but ended up using " << profiled[1] << std::endl; + + auto key = std::make_pair((int)vr[0], (int)vr[1]); + auto iter = cache_.find(key); + if ( iter != cache_.end() ) { + iter->second.threads = profiled[0]; + iter->second.freq_idx = profiled[1]; + } + + vr[0] = profiled[0]; + vr[1] = profiled[1]; + + cache_update((int)vr[0], (int)vr[1], objectives, true); + } + + fr = evaluate_score(objectives, opt_weights); + + if ((f[vs] <= fr) && (fr < f[vh])) + { + // VV: REFLECTED point is better than the SECOND BEST + // but NOT better than the BEST + // Replace WORST point with REFLECTED + for (j = 0; j <= n - 1; j++) + { + v[vg][j] = vr[j]; + } + + my_constraints(v[vg]); + + f[vg] = fr; + + const int threads = (int)(v[vg][0]); + const int freq_idx = (int)(v[vg][1]); + + cache_update(threads, freq_idx, objectives, true); + + state_ = start; + return do_step_start(); + } + else if (fr < f[vs]) + { + // VV: REFLECTED is better than BEST + auto gen_new = [this](double *extra) mutable -> double* { + for (j = 0; j < NMD_NUM_KNOBS; j++) + ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]) - extra[j]; + + my_constraints(ve); + + return ve; + }; + + generate_new(gen_new); + + // VV: Now evaluate EXPANDED + res.threads = ve[0]; + res.freq_idx = ve[1]; + + state_ = expansion; + + auto key = std::make_pair(res.threads, res.freq_idx); + + auto entry = cache_.find(key); + + if (entry != cache_.end()) + { + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + auto dt = timestamp_now - entry->second._cache_timestamp; + + if (dt < entry->second._cache_expires_dt) + { + return do_step_expand(entry->second.objectives, + entry->second.threads, + entry->second.freq_idx); + } + } + + return res; + } + else if ((f[vh] <= fr) && (fr < f[vg])) + { + // VV: REFLECTED between SECOND BEST and WORST + auto gen_new = [this](double *extra) mutable -> double* { + for (j = 0; j < NMD_NUM_KNOBS; j++) + vc[j] = vm[j] + BETA * (vr[j] - vm[j]) - extra[j]; + + my_constraints(vc); + + return vc; + }; + + generate_new(gen_new); + + // VV: Now evaluate EXPANDED + res.threads = vc[0]; + res.freq_idx = vc[1]; + + state_ = contraction_out; + + auto key = std::make_pair(res.threads, res.freq_idx); + + auto entry = cache_.find(key); + + if (entry != cache_.end()) + { + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + auto dt = timestamp_now - entry->second._cache_timestamp; + + if (dt < entry->second._cache_expires_dt) + { + return do_step_contract_out(entry->second.objectives, + entry->second.threads, + entry->second.freq_idx); + } + } + + return res; + } + else + { + // VV: REFLECTED worse than WORST + auto gen_new = [this](double *extra) mutable -> double* { + for (j = 0; j < NMD_NUM_KNOBS; j++) + vc[j] = vm[j] - BETA * (vr[j] - vm[j]) - extra[j]; + + my_constraints(vc); + + return vc; + }; + + generate_new(gen_new); + + // VV: Now evaluate EXPANDED + res.threads = vc[0]; + res.freq_idx = vc[1]; + + state_ = contraction_in; + auto key = std::make_pair(res.threads, res.freq_idx); + + auto entry = cache_.find(key); + + if (entry != cache_.end()) + { + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + auto dt = timestamp_now - entry->second._cache_timestamp; + + if (dt < entry->second._cache_expires_dt) + { + return do_step_contract_in(entry->second.objectives, + entry->second.threads, + entry->second.freq_idx); + } + } + + return res; + } +} + +optstepresult NelderMead::do_step_expand(const double objectives[], + double knob1, double knob2) +{ #ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] Contraction Parameter = (" - << vc[0] << "," << vc[1] << ")" - << std::endl; + std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl; #endif - my_constraints(vc); - state_=contraction; - res.threads=vc[0]; - res.freq_idx=vc[1]; - break; + fe = evaluate_score(objectives, nullptr); + + double profiled[] = {knob1, knob2}; + my_constraints(profiled); + + if ( ve[0] != profiled[0] || ve[1] != profiled[1] ) { + std::cout << "[NelderMead|WARN] Meant to profile expand " << ve[0] << " knob1 " + "but ended up using " << profiled[0] << std::endl; + std::cout << "[NelderMead|WARN] Meant to profile expand " << ve[1] << " knob2 " + "but ended up using " << profiled[1] << std::endl; + + auto key = std::make_pair((int)ve[0], (int)ve[1]); + auto iter = cache_.find(key); + if ( iter != cache_.end() ) { + iter->second.threads = profiled[0]; + iter->second.freq_idx = profiled[1]; } + ve[0] = profiled[0]; + ve[1] = profiled[1]; - /** EXPANSION **/ + cache_update((int)ve[0], (int)ve[1], objectives, true); + } - /** This state is entered when we have received a sample of the objective - ** function at the expansion vertex - **/ - case expansion: + if (fe < fr) + { + // VV: EXPANDED point is better than REFLECTIVE + // Replace WORST with EXPANDED + for (j = 0; j <= n - 1; j++) + { + v[vg][j] = ve[j]; + } + f[vg] = fe; + } + else + { + // VV: Replace WORST with REFLECTED + for (j = 0; j <= n - 1; j++) + { + v[vg][j] = vr[j]; + } + f[vg] = fr; + } + + state_ = start; + const int threads = (int)(v[vg][0]); + const int freq_idx = (int)(v[vg][1]); + + cache_update(threads, freq_idx, objectives, true); + return do_step_start(); +} + +optstepresult NelderMead::do_step_contract_in(const double objectives[], + double knob1, double knob2) +{ + int j; #ifdef NMD_DEBUG_ - std::cout << "[NelderMead DEBUG] State = Expansion" << std::endl; + std::cout << "[NelderMead|DEBUG] State = ContractionIN" << std::endl; #endif - fe=param; - //fe = objfunc(ve); - if (fe < f[vs]) { // if f(E)second.threads = profiled[0]; + iter->second.freq_idx = profiled[1]; + } + + vc[0] = profiled[0]; + vc[1] = profiled[1]; + + cache_update((int)vc[0], (int)vc[1], objectives, true); + } + + if (fc <= f[NMD_NUM_KNOBS]) + { + // VV: CONTRACTED_I is better than WORST + // Replace WORST with CONTRACTED_I + for (j = 0; j < NMD_NUM_KNOBS; j++) + { + v[vg][j] = vc[j]; + } + f[vg] = fc; + + const int threads = (int)(v[vg][0]); + const int freq_idx = (int)(v[vg][1]); + + cache_update(threads, freq_idx, objectives, true); + return do_step_start(); + } + else + { + state_ = shrink; + return do_step_shrink(); + } +} + +optstepresult NelderMead::do_step_contract_out(const double objectives[], + double knob1, double knob2) +{ + int j; #ifdef NMD_DEBUG_ - std::cout << "[NelderMead|DEBUG] State = Contraction" << std::endl; + std::cout << "[NelderMead|DEBUG] State = ContractionOUT" << std::endl; #endif - fc=param; - //fc = objfunc(vc); - if (fc < f[vg]) { // f(C)second.threads = profiled[0]; + iter->second.freq_idx = profiled[1]; + } + + vc[0] = profiled[0]; + vc[1] = profiled[1]; + + cache_update((int)vc[0], (int)vc[1], objectives, true); + } + + if (fc <= fr) + { + // VV: CONTRACTED_O is better than REFLECTED + // Replace WORST with CONTRACTED_O + for (j = 0; j < NMD_NUM_KNOBS; j++) + { + v[vg][j] = vc[j]; + } + f[vg] = fc; + + const int threads = (int)(v[vg][0]); + const int freq_idx = (int)(v[vg][1]); + + cache_update(threads, freq_idx, objectives, true); + return do_step_start(); + } + else + { + state_ = shrink; + return do_step_shrink(); + } +} + +optstepresult NelderMead::do_step_shrink() +{ #ifdef NMD_DEBUG_ - print_iteration(); + std::cout << "[NelderMead|DEBUG] State = Shrink" << std::endl; #endif - res.converged=testConvergence(); - return res; + for (auto i=0ul; i double* { + for (j = 0; j < NMD_NUM_KNOBS; j++) + vr[j] = vm[j] + DELTA * (v[i][j] - vm[j]) - extra[j]; + + my_constraints(vr); + + return vr; + }; + + generate_new(gen_new); + } + + state_ = warmup; + warming_up_step = 0; + return do_step_warmup({}, 0, 0); } -bool NelderMead::testConvergence(){ - - fsum = 0.0; - for (j=0;j<=n;j++) { - fsum += f[j]; - } - favg = fsum/(n+1); - s = 0.0; - for (j=0;j<=n;j++) { - s += pow((f[j]-favg),2.0)/(n); - } - s = sqrt(s); - s = s /favg; // normalization step -#ifdef NMD_INFO_ - std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl; - std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl; -#endif - if (s >= EPSILON && itr <= MAXITERATIONS) - return false; - else{ - vs = vs_index(); - min=f[vs]; - return true; - } +optstepresult NelderMead::do_step_warmup(const double objectives[], + double knob1, double knob2) +{ + #ifdef NMD_DEBUG_ + std::cout << "[NelderMead|DEBUG] State = Warmup " + << warming_up_step << std::endl; + #endif + + OUT_DEBUG( + if ( warming_up_step == 0 ) { + std::cout << "[NelderMead|DEBUG] Initial exploration" << std::endl; + + for ( auto i =0; i 0 && warming_up_step <= NMD_NUM_KNOBS + 1) { + double profiled[] = {knob1, knob2}; + my_constraints(profiled); + + if ( v[warming_up_step-1][0] != profiled[0] || v[warming_up_step-1][1] != profiled[1] ) { + std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][0] << " knob1 " + "but ended up using " << profiled[0] << std::endl; + std::cout << "[NelderMead|WARN] Meant to profile expand " << v[warming_up_step-1][1] << " knob2 " + "but ended up using " << profiled[1] << std::endl; + + auto key = std::make_pair((int)v[warming_up_step-1][0], (int)v[warming_up_step-1][1]); + auto iter = cache_.find(key); + if ( iter != cache_.end() ) { + iter->second.threads = profiled[0]; + iter->second.freq_idx = profiled[1]; + } + + v[warming_up_step-1][0] = profiled[0]; + v[warming_up_step-1][1] = profiled[1]; + } + + // VV: Record results of last warming up step + f[warming_up_step-1] = evaluate_score(objectives, nullptr); + cache_update(v[warming_up_step-1][0], v[warming_up_step-1][1], + objectives, true); + } + + if ( warming_up_step == NMD_NUM_KNOBS + 1) { + // VV: We need not explore the knob_set space anymore + state_ = start; + return step(objectives, knob1, knob2); + } else if (warming_up_step > NMD_NUM_KNOBS + 1) { + std::cout << "[NelderMead|Warn] Unknown warmup step " << warming_up_step << std::endl; + } + optstepresult res; + + res.objectives[0] = -1; + res.objectives[1] = -1; + res.objectives[2] = -1; + res.converged = false; + + res.threads = initial_configurations[warming_up_step][0]; + res.freq_idx = initial_configurations[warming_up_step][1]; + + v[warming_up_step][0] = res.threads; + v[warming_up_step][1] = res.freq_idx; + warming_up_step++; + + return res; } -void NelderMead::updateObjectives(){ - /* re-evaluate all the vertices */ - /*for (j=0;j<=n;j++) { - f[j] = objfunc(v[j]); - } - */ +optstepresult NelderMead::step(const double objectives[], + double knob1, double knob2) +{ + int i, j; + + optstepresult res; + res.threads = 0; + res.freq_idx = -1; + times_used_cached = 0; + + OUT_DEBUG( + auto score = evaluate_score(objectives, nullptr); + + std::cout << "[NelderMead|DEBUG] Starting step with " + << objectives[0] << " " + << objectives[1] << " " + << objectives[2] << " score " << score << std::endl; + ) + + if ( should_update_constraints ) { + for (i=0; isecond.objectives, nullptr); + } + } + #endif + + if ( should_invalidate_cache ) + do_invalidate_cache(); + + if ( should_reevaluate_scores ) + do_reevaluate_scores(); + + switch (state_) + { + case warmup: + { + res = do_step_warmup(objectives, knob1, knob2); + break; + } + break; + case start: + itr++; + res = do_step_start(); + break; + case reflection: + res = do_step_reflect(objectives, knob1, knob2); + break; + case expansion: + res = do_step_expand(objectives, knob1, knob2); + break; + case contraction_in: + res = do_step_contract_in(objectives, knob1, knob2); + break; + case contraction_out: + res = do_step_contract_out(objectives, knob1, knob2); + break; + default: + std::cout << "Unknown NelderMead state (" << state_ << ")" << std::endl; + res.converged = false; + return res; + } - /* find the index of the second largest value */ - vh = vh_index(); + if ( state_ != warmup ) + { + res.converged = testConvergence(tested_combinations); + + if (res.converged == true) + { + res.threads = v[vs][0]; + res.freq_idx = v[vs][1]; + OUT_DEBUG( + std::cout << "[NelderMead|DEBUG] Converged to " << res.threads << " " << res.freq_idx << std::endl; + ) + } + } + + if ( res.threads > constraint_max[0]) + res.threads = (int) constraint_max[0]; + else if ( res.threads < constraint_min[0]) + res.threads = (int) constraint_min[0]; + + if ( res.freq_idx > constraint_max[1]) + res.freq_idx = (int) constraint_max[1]; + else if ( res.freq_idx < constraint_min[1]) + res.freq_idx = (int) constraint_min[1]; + + std::cout << "Stop step with " + << objectives[0] << " " + << objectives[1] << " " + << objectives[2] << std::endl; + + return res; +} - my_constraints(v[vg]); +bool NelderMead::testConvergence(std::size_t tested_combinations) +{ + double temp; + #if 0 + int all_same = 1; + + for (auto i = 0; i <= n; ++i) + { + for (auto k = i + 1; j <= n; ++k) + for (auto j = 0; j < n; ++j) + all_same &= (v[i][j] == v[k][j]); + } - //f[vg] = objfunc(v[vg]); + if (all_same) + { + min = f[vs]; + return true; + } + #endif + bool ret = false; - my_constraints(v[vh]); + fsum = 0.0; + for (auto j = 0; j <= n; j++) + { + fsum += f[j]; + } + favg = fsum / (n + 1); + s = 0.0; + for (auto j = 0; j <= n; j++) + { + temp = (f[j] - favg); + s += temp * temp / (n); + } + s = sqrt(s); + s = s / favg; // normalization step +#ifdef NMD_INFO_ + std::cout << "[NelderMead|INFO] Convergence Ratio is " << s << std::endl; + std::cout << "[NelderMead|INFO] Convergence Threshold set is " << EPSILON << std::endl; +#endif + int max_combinations = (constraint_max[0] - constraint_min[0]+1) + * (constraint_max[1] - constraint_min[1]+1); + + if ( (s >= EPSILON) + && (itr <= MAXITERATIONS) + && (max_combinations != tested_combinations) ) + ret = false; + else + { + sort_vertices(); + min = f[vs]; + + OUT_DEBUG( + std::cout << "[NelderMead|Debug] Cache_ Max: " << max_combinations + << " explored " << tested_combinations << std::endl; + for (const auto &entry: cache_ ) { + std::cout << "[NelderMead|Debug] Cache_ " + << entry.second.threads << " " + << entry.second.freq_idx << " :: " + << entry.second.objectives[0] << " " + << entry.second.objectives[1] << " " + << entry.second.objectives[2] << " :: " + << evaluate_score(entry.second.objectives, nullptr) << std::endl; + } + ) + + ret = true; + } - //f[vh] = objfunc(v[vh]); -} + if ( ret == true && convergence_reevaluating == true ) { + // VV: Now find the best result from cache + sort_vertices(); -} -} -/* + double best_knobs[NMD_NUM_KNOBS] = { v[vs][0], v[vs][1]}; + double best_score = f[vs]; -std::vector NelderMead::minimum(){ + for ( const auto & entry: cache_ ) { + auto cur_score = evaluate_score(entry.second.objectives, nullptr); + if ( cur_score < best_score) { + best_knobs[0] = entry.second.threads; + best_knobs[1] = entry.second.freq_idx; + best_score = cur_score; + } + } - free(f); - free(vr); - free(ve); - free(vc); - free(vm); - for (i=0;i<=n;i++) { - free (v[i]); - } - free(v); - return min; + v[vs][0] = best_knobs[0]; + v[vs][1] = best_knobs[1]; + f[vs] = best_score; + return true; + } else if ( ret == true ) { + // VV: Do another final run to make sure that the objective scores still hold up + OUT_DEBUG ( + std::cout << "[NelderMead|Debug] Doing another final search" << std::endl; + ) + state_ = warmup; + warming_up_step = 0; + itr --; + convergence_reevaluating = true; + std::vector fresh; + + for ( const auto &entry: cache_ ) { + fresh.push_back(entry.second); + } + cache_.clear(); -} -*/ + std::sort(fresh.begin(), fresh.end(), + [this](const optstepresult &l, const optstepresult &r) mutable -> int { + return evaluate_score(l.objectives, nullptr) < + evaluate_score(r.objectives, nullptr); + }); + for (auto i=0ul; i objectives_priorities; - int objectives_priority_idx=0; - std::size_t num_localities = allscale::get_num_localities(); std::unique_lock l(resize_mtx_); hpx::util::ignore_while_checking> il(&l); + if (initialized_) return; #ifdef MEASURE_ - update_active_osthreads(0); -#ifdef ALLSCALE_HAVE_CPUFREQ - update_power_consumption(hardware_reconf::read_system_power()); -#endif + last_measure_power = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + last_measure_threads = last_measure_power; #endif rp_ = &hpx::resource::get_partitioner(); @@ -223,8 +212,6 @@ void scheduler::init() { ) ); -// std::cout << "init: " << num_cores << " " << allscale::get_num_localities() << " " << depth_cut_off_ << '\n'; - // Reading user provided options in terms of desired optimization objectives std::string input_objective_str = hpx::get_config_entry("allscale.objective", ""); @@ -232,17 +219,29 @@ void scheduler::init() { /* Read optimization policy selected by the user. If not specified, allscale policy is the default */ std::string input_optpolicy_str = - hpx::get_config_entry("allscale.policy", "allscale"); + hpx::get_config_entry("allscale.policy", "none"); + if ( input_optpolicy_str == "none" ){ + char *c_optpolicy = std::getenv("ALLSCALE_LOCAL_OPTIMIZER"); + if ( c_optpolicy) + input_optpolicy_str = std::string(c_optpolicy); + } + + + uselopt=false; #ifdef DEBUG_MULTIOBJECTIVE_ std::cout << "[Local Optimizer|INFO] Optimization Policy Active = " << input_optpolicy_str << std::endl; #endif - if (input_optpolicy_str=="allscale") - lopt_.setPolicy(allscale); - else if (input_optpolicy_str=="random") + if (input_optpolicy_str=="allscale") + lopt_.setPolicy(allscale); + else if (input_optpolicy_str=="random") lopt_.setPolicy(random); - else if (input_optpolicy_str=="manual") + else if (input_optpolicy_str=="manual") lopt_.setPolicy(manual); - else lopt_.setPolicy(allscale); + else if ( input_optpolicy_str != "none" ) { + HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", + "unknown allscale.policy"); + } + #ifdef MEASURE_MANUAL_ std::string input_osthreads_str = @@ -265,6 +264,12 @@ void scheduler::init() { } #endif + if (input_objective_str.empty() ){ + char *c_opt_objective = std::getenv("ALLSCALE_LOCAL_OBJECTIVE"); + if ( c_opt_objective ) + input_objective_str = std::string(c_opt_objective); + } + if (!input_objective_str.empty()) { uselopt=true; std::istringstream iss_leeways(input_objective_str); @@ -276,95 +281,54 @@ void scheduler::init() { #ifdef DEBUG_INIT_ std::cout << "Scheduling Objective provided: " << obj << "\n"; #endif - // Don't scale objectives if none is given - double leeway = 1.0; + // VV: Don't scale objectives if none is given + double opt_weight = 1.0; if (idx != std::string::npos) { #ifdef DEBUG_INIT_ - std::cout << "Found a leeway, triggering multi-objectives policies\n" - << std::flush; + std::cout << "Found an optimization weight, triggering " + "multi-objectives policies\n" << std::flush; #endif multi_objectives = true; obj = objective_str.substr(0, idx); - leeway = std::stod(objective_str.substr(idx + 1)); + opt_weight = std::stod(objective_str.substr(idx + 1)); } if (obj == "time") { time_requested = true; - objectives_priorities.push_back(time); -#ifdef DEBUG_INIT_ - std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx] - << std::endl; -#endif - time_leeway = leeway; + time_weight = opt_weight; #ifdef DEBUG_INIT_ - std::cout << "Set time margin to " << time_leeway << "\n" << std::flush; + std::cout << "Set time weight to " << time_weight << "\n" << std::flush; #endif - } else if (obj == "resource") { - resource_requested = true; - objectives_priorities.push_back(resource); -#ifdef DEBUG_INIT_ - std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx] - << std::endl; -#endif - resource_leeway = leeway; + resource_requested = true; + resource_weight = opt_weight; #ifdef DEBUG_INIT_ - std::cout << "Set resource margin to " << resource_leeway << "\n" + std::cout << "Set resource weight to " << resource_weight << "\n" << std::flush; - ; #endif } else if (obj == "energy") { - energy_requested = true; - objectives_priorities.push_back(energy); + energy_requested = true; + energy_weight = opt_weight; #ifdef DEBUG_INIT_ - std::cout << "Priority[" << objectives_priority_idx << "]=" << objectives_priorities[objectives_priority_idx] - << std::endl; -#endif - energy_leeway = leeway; -#ifdef DEBUG_INIT_ - std::cout << "Set energy margin to " << energy_leeway << "\n" + std::cout << "Set energy weight to " << energy_weight << "\n" << std::flush; - ; #endif } else { - std::ostringstream all_keys; - copy(scheduler::objectives.begin(), scheduler::objectives.end(), - std::ostream_iterator(all_keys, ",")); - std::string keys_str = all_keys.str(); - keys_str.pop_back(); + std::cout << "TRIED PARSING \"" << obj << "\"" << std::endl; HPX_THROW_EXCEPTION( hpx::bad_request, "scheduler::init", boost::str( - boost::format("Wrong objective: %s, Valid values: [%s]") % obj % - keys_str)); + boost::format("Wrong objective: Valid values: [time, energy, resource]"))); } - if (time_leeway > 1 || resource_leeway > 1 || energy_leeway > 1) { + if (time_weight > 2 || resource_weight > 2 || energy_weight > 2 + || time_weight < -2 || resource_weight < -2 || energy_weight < -2) { HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", - "leeways should be within ]0, 1]"); + "Objective weights should be within [-2, 2]"); } - objectives_priority_idx++; - } - } - objectives_priority_idx--; - - /* Reading optional user provided input for granularity (step) of - adding/removing resources to/from the runtime (where resource=OS thread) */ - std::string input_resource_step_str = - hpx::get_config_entry("allscale.resource_step", ""); - if (!input_resource_step_str.empty()) { - - resource_step = std::stoul(input_resource_step_str); -#ifdef DEBUG_INIT_ - std::cout << "Resource step provided : " << resource_step << "\n"; -#endif - if (resource_step == 0 || resource_step >= os_thread_count) { - HPX_THROW_EXCEPTION( - hpx::bad_request, "scheduler::init", - "resource step should be within ]0, total nb threads["); } } @@ -393,18 +357,14 @@ void scheduler::init() { executors_.emplace_back(pool_name); } -#if defined(ALLSCALE_HAVE_CPUFREQ) if (multi_objectives) { - // reallocating objectives_status vector of vectors - objectives_status.resize(3); - for (int i = 0; i < 3; i++) { - objectives_status[i].resize(3); - } -#ifdef DEBUG_INIT_ + + #ifdef DEBUG_INIT_ std::cout << "\n****************************************************\n" << std::flush; - std::cout << "Policy selected: multi-objective set with time=" << time_leeway - << ", resource=" << resource_leeway - << ", energy=" << energy_leeway << "\n" + std::cout << "Policy selected: multi-objective set with time=" << time_weight + << ", energy=" << energy_weight + << ", resource=" << resource_weight + << "\n" << std::flush; std::cout << "Objectives Flags Set: \n" << "\tTime: " << time_requested << @@ -413,18 +373,16 @@ void scheduler::init() { "\tMulti-objective: " << multi_objectives << "\n" << std::flush; std::cout << "****************************************************\n" << std::flush; -#endif + #endif } if (energy_requested) initialize_cpu_frequencies(); -#ifdef MEASURE_MANUAL_ + #ifdef MEASURE_MANUAL_ if (manual_input_provided && input_objective_str.empty()) fix_allcores_frequencies(temp_idx); -#endif - -#endif + #endif initialized_ = true; #ifdef DEBUG_INIT_ @@ -442,64 +400,31 @@ void scheduler::init() { last_optimization_timestamp_ = t_duration_now; last_objective_measurement_timestamp_= t_duration_now; - std::list objectives_temp; - if (energy_requested){ - objective o_temp; - o_temp.type=energy; - o_temp.leeway=energy_leeway; - int i=0; - for(auto& el: objectives_priorities){ - if (el==energy){ - o_temp.priority=i; - break; - } - ++i; - } - objectives_temp.push_back(o_temp); - } - if (time_requested){ - objective o_temp; - o_temp.type=time; - o_temp.leeway=time_leeway; - int i=0; - for(auto& el: objectives_priorities){ - if (el==time){ - o_temp.priority=i; - break; - } - ++i; - } - objectives_temp.push_back(o_temp); - } - if (resource_requested){ - objective o_temp; - o_temp.type=resource; - o_temp.leeway=resource_leeway; - int i=0; - for(auto& el: objectives_priorities){ - if (el==resource){ - o_temp.priority=i; - break; - } - ++i; - } - objectives_temp.push_back(o_temp); - } - lopt_.setobjectives(objectives_temp); lopt_.setmaxthreads(os_thread_count); - lopt_.reset(os_thread_count,0); - #if defined(ALLSCALE_HAVE_CPUFREQ) + + #if defined(ALLSCALE_HAVE_CPUFREQ) using hardware_reconf = allscale::components::util::hardware_reconf; - std::vector freq_temp = - lopt_.setfrequencies(hardware_reconf::get_frequencies(0)); + auto freqs = hardware_reconf::get_frequencies(0); + + auto freq_temp = lopt_.setfrequencies(freqs); if (freq_temp.empty()){ HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", "error in initializing the local optimizer, allowed frequency values are empty"); } - #endif -#ifdef DEBUG_ + // VV: Set to max number of threads and max frequency + lopt_.reset(os_thread_count, freqs.size()-1); + #else + // VV: Max number of threads, and an arbitrary frequency index + lopt_.reset(os_thread_count,0); + auto freq_temp = lopt_.setfrequencies({0}); + #endif + + // VV: Set objectives after setting all constraints to + // trigger the initialization of nmd + lopt_.setobjectives(time_weight, energy_weight, resource_weight); + #ifdef DEBUG_ lopt_.printobjectives(); -#endif + #endif } } @@ -512,16 +437,13 @@ void scheduler::init() { * potential. * */ -void scheduler::initialize_cpu_frequencies() { #if defined(ALLSCALE_HAVE_CPUFREQ) +void scheduler::initialize_cpu_frequencies() +{ using hardware_reconf = allscale::components::util::hardware_reconf; cpu_freqs = hardware_reconf::get_frequencies(0); - freq_step = 8; // cpu_freqs.size() / 2; - freq_times.resize(cpu_freqs.size()); - -#ifdef MEASURE_ -#ifdef ALLSCALE_HAVE_CPUFREQ -#ifdef DEBUG_INIT_ + + #if defined(MEASURE_) && defined(DEBUG_INIT) unsigned long temp_transition_latency=hardware_reconf::get_cpu_transition_latency(1); if (temp_transition_latency==0) std::cout << "[INFO] Transition Latency Unavailable" << @@ -530,45 +452,37 @@ void scheduler::initialize_cpu_frequencies() { std::cout << "[INFO] Core-1 Frequency Transition Latency = " << hardware_reconf::get_cpu_transition_latency(2)/1000 << " milliseconds\n" << std::flush; -#endif -#endif -#endif -#ifdef DEBUG_INIT_ + #endif + + #ifdef DEBUG_INIT_ std::cout << "[INFO] Governors available on the system: " << "\n" << std::flush; -#ifdef ALLSCALE_HAVE_CPUFREQ std::vector temp_governors = hardware_reconf::get_governors(0); for (std::vector::const_iterator i = temp_governors.begin(); i != temp_governors.end(); ++i) std::cout << "[INFO]\t" << *i << "\n" << std::flush; -#endif std::cout << "\n" << std::flush; -#endif -#ifdef DEBUG_INIT_ std::cout << "Server Processor Available Frequencies (size = " << cpu_freqs.size() << ")"; for (auto &ind : cpu_freqs) { std::cout << ind << " "; } std::cout << "\n" << std::flush; -#endif + #endif auto min_max_freqs = std::minmax_element(cpu_freqs.begin(), cpu_freqs.end()); min_freq = *min_max_freqs.first; max_freq = *min_max_freqs.second; - -#ifdef DEBUG_INIT_ - std::cout << "Min freq: " << min_freq << ", Max freq: " << max_freq << "\n" - << std::flush; -#endif // TODO: verify that nbpus == all pus of the system, not just the online // ones size_t nbpus = topo_->get_number_of_pus(); -#ifdef DEBUG_INIT_ + + #ifdef DEBUG_INIT_ + std::cout << "Min freq: " << min_freq << ", Max freq: " << max_freq << "\n" + << std::flush; std::cout << "nbpus known to topo_: " << nbpus << "\n" << std::flush; -#endif + #endif -#ifdef ALLSCALE_HAVE_CPUFREQ hardware_reconf::make_cpus_online(0, nbpus); hardware_reconf::topo_init(); // We have to set CPU governors to userpace in order to change frequencies @@ -579,13 +493,12 @@ void scheduler::initialize_cpu_frequencies() { topo = hardware_reconf::read_hw_topology(); // first reinitialize to a normal setup - for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id++) { + for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id++){ hardware_reconf::set_freq_policy(cpu_id, policy); -#ifdef DEBUG_INIT_ - std::cout << "cpu_id " << cpu_id << " back to on-demand. ret= " << res - << "\n" - << std::flush; -#endif + #ifdef DEBUG_INIT_ + std::cout << "cpu_id " << cpu_id << " back to on-demand. ret= " + << res << std::endl; + #endif } governor = "userspace"; @@ -593,8 +506,10 @@ void scheduler::initialize_cpu_frequencies() { policy.min = min_freq; policy.max = max_freq; - for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; - cpu_id += topo.num_hw_threads) { + for (unsigned int cpu_id = 0; + cpu_id < topo.num_logical_cores; + cpu_id += topo.num_hw_threads) + { int res = hardware_reconf::set_freq_policy(cpu_id, policy); if (res) { HPX_THROW_EXCEPTION(hpx::bad_request, "scheduler::init", @@ -603,34 +518,29 @@ void scheduler::initialize_cpu_frequencies() { return; } -#ifdef DEBUG_INIT_ + #ifdef DEBUG_INIT_ std::cout << "cpu_id " << cpu_id << " initial freq policy setting. ret= " << res << "\n" << std::flush; -#endif + #endif } -#endif - // Set frequency of all threads to max when we start - { - // set freq to all PUs used by allscale - for (std::size_t i = 0; i != thread_pools_.size(); ++i) { - std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); - for (std::size_t j = 0; j < thread_count; j++) { - std::size_t pu_num = - rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); + // set freq to all PUs used by allscale + for (std::size_t i = 0; i != thread_pools_.size(); ++i) { + std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); + for (std::size_t j = 0; j < thread_count; j++) { + std::size_t pu_num = + rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); -#ifdef ALLSCALE_HAVE_CPUFREQ - if (!cpufreq_cpu_exists(pu_num)) { - hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[0]); -#ifdef DEBUG_INIT_ - std::cout << "Setting cpu " << pu_num << " to freq " << cpu_freqs[0] - << ", (ret= " << res << ")\n" - << std::flush; -#endif - } -#endif + + if (!cpufreq_cpu_exists(pu_num)) { + hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[0]); + #ifdef DEBUG_INIT_ + std::cout << "Setting cpu " << pu_num << " to freq " << cpu_freqs[0] + << ", (ret= " << res << ")\n" + << std::flush; + #endif } } } @@ -639,37 +549,33 @@ void scheduler::initialize_cpu_frequencies() { // Make sure frequency change happened before continuing std::cout << "topo.num_logical_cores: " << topo.num_logical_cores - << "topo.num_hw_threads" << topo.num_hw_threads << "\n" + << " topo.num_hw_threads" << topo.num_hw_threads << "\n" << std::flush; - { - // check status of Pus frequency -#ifdef ALLSCALE_HAVE_CPUFREQ - for (std::size_t i = 0; i != thread_pools_.size(); ++i) { - unsigned long hardware_freq = 0; - std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); - for (std::size_t j = 0; j < thread_count; j++) { - std::size_t pu_num = - rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); - - if (!cpufreq_cpu_exists(pu_num)) { - do { - hardware_freq = hardware_reconf::get_hardware_freq(pu_num); -#ifdef DEBUG_INIT_ - std::cout << "current freq on cpu " << pu_num << " is " - << hardware_freq << " (target freq is " << cpu_freqs[0] - << " )\n" - << std::flush; - -#endif + // check status of Pus frequency + + for (std::size_t i = 0; i != thread_pools_.size(); ++i) { + unsigned long hardware_freq = 0; + std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); + for (std::size_t j = 0; j < thread_count; j++) { + std::size_t pu_num = + rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); + + if (!cpufreq_cpu_exists(pu_num)) { + do { + hardware_freq = hardware_reconf::get_hardware_freq(pu_num); + #ifdef DEBUG_INIT_ + std::cout << "current freq on cpu " << pu_num << " is " + << hardware_freq << " (target freq is " << cpu_freqs[0] + << " )\n" + << std::flush; + #endif - } while (hardware_freq != cpu_freqs[0]); - } + } while (hardware_freq != cpu_freqs[0]); } } -#endif } -#ifdef ALLSCALE_USE_CORE_OFFLINING + #ifdef ALLSCALE_USE_CORE_OFFLINING // offline unused cpus for (unsigned int cpu_id = 0; cpu_id < topo.num_logical_cores; cpu_id += topo.num_hw_threads) { @@ -682,25 +588,23 @@ void scheduler::initialize_cpu_frequencies() { } if (!found_it) { -#ifdef DEBUG_INIT_ + #ifdef DEBUG_INIT_ std::cout << " setting cpu_id " << cpu_id << " offline \n" << std::flush; -#endif + #endif -#ifdef ALLSCALE_HAVE_CPUFREQ hardware_reconf::make_cpus_offline(cpu_id, cpu_id + topo.num_hw_threads); -#endif } } -#endif - + #endif +} #else - // should we really abort or should we reset energy to 1 ? - HPX_THROW_EXCEPTION( - hpx::bad_request, "scheduler::init", - "Requesting energy objective without having compiled with cpufreq"); -#endif +void scheduler::initialize_cpu_frequencies() +{ + cpu_freqs.clear(); + // VV: Bogus frequency + cpu_freqs.push_back(1000*1024); } - +#endif /** * @@ -717,9 +621,7 @@ void scheduler::optimize_locally(work_item const& work) // find out which pool has the most threads /* Count Active threads for validation*/ - hpx::threads::mask_type active_mask; - std::size_t active_threads_ = 0; std::size_t domain_active_threads = 0; std::size_t pool_idx = 0; int total_threads_counted=0; @@ -736,21 +638,13 @@ void scheduler::optimize_locally(work_item const& work) } } std::cout << "Active OS Threads = " << total_threads_counted << std::endl; -#endif -#ifdef MEASURE_ -#ifdef ALLSCALE_HAVE_CPUFREQ - std::size_t temp_id = work.id().id; - if ((temp_id >= period_for_power) && - (temp_id % period_for_power == 0)) - update_power_consumption(hardware_reconf::read_system_power()); -#endif #endif -#ifdef ALLSCALE_HAVE_CPUFREQ - if (uselopt && !lopt_.isConverged()){ + if (uselopt && !lopt_.isConverged()) { last_power_usage++; - current_power_usage = hardware_reconf::read_system_power(); + allscale::components::monitor *monitor_c = &allscale::monitor::get(); + current_power_usage = monitor_c->get_current_power(); power_sum += current_power_usage; auto t_now = std::chrono::system_clock::now(); @@ -760,6 +654,10 @@ void scheduler::optimize_locally(work_item const& work) long elapsedTimeMs = t_duration_now - last_objective_measurement_timestamp_; + auto dt_power = t_duration_now - last_measure_power; + last_measure_power = t_duration_now; + update_power_consumption(power_sum/last_power_usage, dt_power); + if (elapsedTimeMs > objective_measurement_period_ms){ last_objective_measurement_timestamp_= t_duration_now; @@ -773,60 +671,90 @@ void scheduler::optimize_locally(work_item const& work) #endif current_avg_iter_time = 0.0; } - - lopt_.measureObjective(current_avg_iter_time,power_sum/last_power_usage, + double last_objectives[] = {current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()), + active_threads}; + lopt_.measureObjective(current_avg_iter_time,power_sum/(last_power_usage*monitor_c->get_max_power()), active_threads); - last_power_usage=0; - power_sum=0; + + last_objective_score = lopt_.evaluate_score(last_objectives); + + auto power_dt = t_duration_now - last_measure_power; + update_power_consumption(power_sum/last_power_usage, power_dt); + last_measure_power = t_duration_now; + + // VV: instead of starting from scratch, remember the last power measurement + last_power_usage=1; + power_sum=current_power_usage; } elapsedTimeMs = t_duration_now - last_optimization_timestamp_; - if (elapsedTimeMs > optimization_period_ms){ + if (elapsedTimeMs > optimization_period_ms || nr_opt_steps == 0){ last_optimization_timestamp_= t_duration_now; nr_opt_steps++; - actuation act_temp = lopt_.step(); + actuation act_temp = lopt_.step(active_threads); #ifdef DEBUG_MULTIOBJECTIVE_ lopt_.printverbosesteps(act_temp); #endif - // amend threads if signaled - /* - if (act_temp.delta_threads<0){ - unsigned int suspended_temp = - suspend_threads(-1 * act_temp.delta_threads); - lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp); + auto dt_threads = t_duration_now - last_measure_threads; + update_active_osthreads(active_threads, dt_threads); + last_measure_threads = t_duration_now; + if (act_temp.threads < active_threads){ + suspend_threads(active_threads-act_temp.threads); } - else if (act_temp.delta_threads>0){ - unsigned int resumed_temp = - resume_threads(act_temp.delta_threads); - lopt_.setCurrentThreads(lopt_.getCurrentThreads()+resumed_temp); + else if (act_temp.threads > active_threads){ + resume_threads(act_temp.threads - active_threads); } - */ - - if (act_temp.delta_threads < active_threads){ -#ifdef DEBUG_MULTIOBJECTIVE_ - int new_threads_target = (int)active_threads - act_temp.delta_threads; - std::cout << "[SCHEDULER|INFO]: Optimizer induced threads to suspend: " << new_threads_target << std::endl; - std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << ", target threads = " << act_temp.delta_threads << std::endl; -#endif - //unsigned int suspended_temp = suspend_threads(new_threads_target); - //lopt_.setCurrentThreads(lopt_.getCurrentThreads()-suspended_temp); + fix_allcores_frequencies(act_temp.frequency_idx); + lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx); + lopt_.setCurrentThreads(active_threads); - lopt_.setCurrentThreads(active_threads); - } - else if (act_temp.delta_threads > active_threads){ #ifdef DEBUG_MULTIOBJECTIVE_ - int new_threads_target = act_temp.delta_threads - (int)active_threads; - std::cout << "[SCHEDULER|INFO]: Optimizer induced threads to resume to: " << new_threads_target << std::endl; - std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << ", target threads = " << act_temp.delta_threads << std::endl; + std::cout << "[SCHEDULER|INFO]: Active Threads = " << active_threads << " out of " << lopt_.getmaxthreads() + << " , target threads = " << act_temp.threads << std::endl; #endif - fix_allcores_frequencies(act_temp.frequency_idx); - lopt_.setCurrentFrequencyIdx(act_temp.frequency_idx); - } } - } // uselopt -#endif - } + } + #ifdef MEASURE_ + else { + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + auto dt = timestamp_now - last_measure_power; + if ( dt >= 1000 ) { + allscale::components::monitor *monitor_c = &allscale::monitor::get(); + auto cur_power = monitor_c->get_current_power(); + + update_power_consumption(cur_power, dt); + last_measure_power = timestamp_now; + } + } + #endif + } +} + + +void scheduler::update_max_threads(std::size_t max_threads) +{ + std::cout << "Will try to set max threads to " << max_threads < max_threads ) + suspend_threads(active_threads - max_threads); + else if ( active_threads < max_threads ) + resume_threads(max_threads - active_threads); +} + +void scheduler::set_local_optimizer_weights(double time_weight, + double energy_weight, + double resource_weight) +{ + lopt_.setobjectives(time_weight, energy_weight, resource_weight); +} + +void scheduler::get_local_optimizer_weights(double *time_weight, + double *energy_weight, + double *resource_weight) +{ + lopt_.getobjectives(time_weight, energy_weight, resource_weight); } std::pair> scheduler::schedule_local(work_item work, @@ -1057,10 +985,6 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) { std::cout << "total active PUs: " << active_threads_ << "\n"; #endif -#ifdef MEASURE_ - update_active_osthreads(active_threads_-active_threads); -#endif - active_threads = active_threads_; growing = false; @@ -1122,9 +1046,6 @@ unsigned int scheduler::suspend_threads(std::size_t suspendthreads) { ) ); } -#ifdef MEASURE_ - update_active_osthreads(-1 * suspend_threads.size()); -#endif active_threads = active_threads - suspend_threads.size(); @@ -1243,10 +1164,6 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) { std::cout << "total active PUs: " << active_threads_ << "\n"; #endif -#ifdef MEASURE_ - update_active_osthreads(active_threads_-active_threads); -#endif - active_threads = active_threads_; // if no thread is suspended, nothing to do if (domain_blocked_threads == 0) { @@ -1302,9 +1219,6 @@ unsigned int scheduler::resume_threads(std::size_t resumethreads) { ) ); } -#ifdef MEASURE_ - update_active_osthreads(resume_threads.size()); -#endif active_threads = active_threads + resume_threads.size(); #ifdef DEBUG_THREADSTATUS_ std::cout << "[SCHEDULER|INFO]: Thread Resume - Newly Active Threads: " << active_threads @@ -1334,9 +1248,9 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){ // ones size_t nbpus = topo_->get_number_of_pus(); -#ifdef DEBUG_FREQSCALING_ + #ifdef DEBUG_FREQSCALING_ std::cout << "nbpus known to topo_: " << nbpus << "\n" << std::flush; -#endif + #endif hardware_reconf::make_cpus_online(0, nbpus); hardware_reconf::topo_init(); @@ -1357,117 +1271,104 @@ void scheduler::fix_allcores_frequencies(int frequency_idx){ "set cpu frequency"); return; } -#ifdef DEBUG_FREQSCALING_ + #ifdef DEBUG_FREQSCALING_ std::cout << "cpu_id " << cpu_id << " initial freq policy setting. ret= " << res << "\n" << std::flush; -#endif + #endif } - - { - // set freq of all cores used to min - for (std::size_t i = 0; i != thread_pools_.size(); ++i) { - std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); - for (std::size_t j = 0; j < thread_count; j++) { - std::size_t pu_num = - rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); - - if (!cpufreq_cpu_exists(pu_num)) { - //int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[cpu_freqs[.size()-1]]); - int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[frequency_idx]); - (void)res; -#if defined(MEASURE_MANUAL_) - fixed_frequency_ = cpu_freqs[frequency_idx]; -#endif -#ifdef DEBUG_FREQSCALING_ - //std::cout << "Setting cpu " << pu_num << " to freq " << cpu_freqs[cpu_freqs.size()-1] - std::cout << "Setting cpu " << pu_num << " to freq " << cpu_freqs[frequency_idx] - << ", (ret= " << res << ")\n" - << std::flush; -#endif - } + // set freq of all cores used to min + for (std::size_t i = 0; i != thread_pools_.size(); ++i) { + std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); + for (std::size_t j = 0; j < thread_count; j++) { + std::size_t pu_num = + rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); + + if (!cpufreq_cpu_exists(pu_num)) { + //int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[cpu_freqs[.size()-1]]); + int res = hardware_reconf::set_frequency(pu_num, 1, cpu_freqs[frequency_idx]); + (void)res; + #if defined(MEASURE_MANUAL_) + fixed_frequency_ = cpu_freqs[frequency_idx]; + #endif + #ifdef DEBUG_FREQSCALING_ + //std::cout << "Setting cpu " << pu_num << " to freq " << cpu_freqs[cpu_freqs.size()-1] + std::cout << "Setting cpu " << pu_num << " to freq " << cpu_freqs[frequency_idx] + << ", (ret= " << res << ")\n" + << std::flush; + #endif } } } - { - // check status of Pus frequency - for (std::size_t i = 0; i != thread_pools_.size(); ++i) { - unsigned long hardware_freq = 0; - std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); - for (std::size_t j = 0; j < thread_count; j++) { - std::size_t pu_num = - rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); - if (!cpufreq_cpu_exists(pu_num)) { - do { - hardware_freq = hardware_reconf::get_hardware_freq(pu_num); -#ifdef DEBUG_FREQSCALING_ - std::cout << "current freq on cpu " << pu_num << " is " - //<< hardware_freq << " (target freq is " << cpu_freqs[cpu_freqs.size()-1] - << hardware_freq << " (target freq is " << cpu_freqs[frequency_idx] - << " )\n" - << std::flush; + // check status of Pus frequency + for (std::size_t i = 0; i != thread_pools_.size(); ++i) { + unsigned long hardware_freq = 0; + std::size_t thread_count = thread_pools_[i]->get_os_thread_count(); + for (std::size_t j = 0; j < thread_count; j++) { + std::size_t pu_num = + rp_->get_pu_num(j + thread_pools_[i]->get_thread_offset()); -#endif + if (!cpufreq_cpu_exists(pu_num)) { + do { + hardware_freq = hardware_reconf::get_hardware_freq(pu_num); + #ifdef DEBUG_FREQSCALING_ + std::cout << "current freq on cpu " << pu_num << " is " + //<< hardware_freq << " (target freq is " << cpu_freqs[cpu_freqs.size()-1] + << hardware_freq << " (target freq is " << cpu_freqs[frequency_idx] + << " )\n" - //} while (hardware_freq != cpu_freqs[cpu_freqs.size()-1]); - } while (hardware_freq != cpu_freqs[frequency_idx]); - } + << std::flush; + #endif + //} while (hardware_freq != cpu_freqs[cpu_freqs.size()-1]); + } while (hardware_freq != cpu_freqs[frequency_idx]); } } } + +} +#else +void scheduler::fix_allcores_frequencies(int frequency_idx) +{ + // VV: This is a stub } #endif #ifdef MEASURE_ -void scheduler::update_active_osthreads(std::size_t delta) { - std::size_t temp = active_threads + delta; - if (meas_active_threads_max==0) - meas_active_threads_max=temp; +void scheduler::update_active_osthreads(std::size_t threads, int64_t delta_time) { - if (meas_active_threads_min==0) - meas_active_threads_min=temp; + if (meas_active_threads_max==0 || meas_active_threads_max < threads) + meas_active_threads_max=threads; - if (meas_active_threads_sum==0){ - meas_active_threads_count++; - meas_active_threads_sum=active_threads; - return; - } + if (meas_active_threads_min==0 || meas_active_threads_min > threads) + meas_active_threads_min=threads; - if ((temp >= min_threads) && (temp <= os_thread_count)){ - meas_active_threads_count++; - meas_active_threads_sum+=temp; - if (temp > meas_active_threads_max) - meas_active_threads_max=temp; - if (temp < meas_active_threads_min) - meas_active_threads_min=temp; - } + meas_active_threads_count += delta_time; + meas_active_threads_sum += threads * delta_time; + + std::cout <<"REGISTERING THREADS " << threads << " for " << delta_time << + " current average " << (meas_active_threads_sum/meas_active_threads_count) << std::endl; } -void scheduler::update_power_consumption(std::size_t power_sample) { - if (meas_power_max==0) +void scheduler::update_power_consumption(std::size_t power_sample, int64_t delta_time) +{ + if ( power_sample > 10000) + return; + + if (meas_power_max==0 || meas_power_max < power_sample) meas_power_max=power_sample; - if (meas_power_min==0) + if (meas_power_min==0 || meas_power_min > power_sample) meas_power_min=power_sample; - if (meas_power_sum==0){ - meas_power_count++; - meas_power_sum=power_sample; - return; - } - if (power_sample <= 10000){ - meas_power_count++; - meas_power_sum+=power_sample; - if (power_sample > meas_power_max) - meas_power_max=power_sample; - if (power_sample < meas_power_min) - meas_power_min=power_sample; - } + meas_power_count += delta_time; + meas_power_sum += power_sample * delta_time; + + std::cout << "Reporting Threads:" << active_threads << " Power:" << power_sample << " for Dt:" << delta_time << std::endl; } #endif @@ -1494,51 +1395,33 @@ void scheduler::stop() { ++pool_idx; } } - - /* - - if (energy_requested) { -#if defined(ALLSCALE_HAVE_CPUFREQ) - - for (int cpu_id = 0; cpu_id < topo.num_logical_cores; - cpu_id += topo.num_hw_threads) { - bool found_it = false; - for (std::size_t i = 0; i != thread_pools_.size(); i++) { - if (hpx::threads::test(initial_masks_[i], cpu_id)) - found_it = true; - } - - if (!found_it) { -#ifdef DEBUG_ - std::cout << " setting cpu_id " << cpu_id << " back online \n" - << std::flush; -#endif - - hardware_reconf::make_cpus_online(cpu_id, cpu_id + topo.num_hw_threads); - } - } - - std::string governor = "ondemand"; - policy.governor = const_cast(governor.c_str()); - std::cout << "Set CPU governors back to " << governor << std::endl; - for (int cpu_id = 0; cpu_id < topo.num_logical_cores; - cpu_id += topo.num_hw_threads) - int res = hardware_reconf::set_freq_policy(cpu_id, policy); -#endif - } - */ - stopped_ = true; - // work_queue_cv_.notify_all(); - // std::cout << "rank(" << rank_ << "): scheduled " << count_ << "\n"; - /* Output all measured metrics */ #ifdef DEBUG_MULTIOBJECTIVE_ #ifdef MEASURE_ + auto timestamp_now = std::chrono::time_point_cast(std::chrono::system_clock::now()).time_since_epoch().count(); + auto dt_threads = timestamp_now - last_measure_threads; + auto dt_power = timestamp_now - last_measure_power; + + last_measure_power = timestamp_now; + last_measure_threads = timestamp_now; + + update_active_osthreads(active_threads, dt_threads); + allscale::components::monitor *monitor_c = &allscale::monitor::get(); + + auto measurement = monitor_c->get_current_power(); + if ( measurement <= 10000 ) { + update_power_consumption(measurement, dt_power); + } + + if ( meas_active_threads_count == 0 ) + meas_active_threads_count = 1; + if ( meas_power_count == 0 ) + meas_power_count = 1; + std::cout << "\n****************************************************\n" << std::flush; std::cout << "Measured Metrics of Application Execution:\n" - << "\tTotal number of tasks scheduled locally (#taskslocal) = " << nr_tasks_scheduled << std::endl @@ -1571,5 +1454,6 @@ void scheduler::stop() { #endif } -} -} + +} // components +} // allscale diff --git a/src/components/util/hardware_reconf.cpp b/src/components/util/hardware_reconf.cpp index 4cf1491..b515977 100644 --- a/src/components/util/hardware_reconf.cpp +++ b/src/components/util/hardware_reconf.cpp @@ -5,6 +5,7 @@ #include #include #include +#include // std::sort #include @@ -25,6 +26,7 @@ namespace allscale { namespace components { namespace util { if (available_frequencies != nullptr) cpufreq_put_available_frequencies(available_frequencies); + std::sort(frequencies.begin(), frequencies.end()); return frequencies; } diff --git a/src/dashboard.cpp b/src/dashboard.cpp index ee326f1..8de511f 100644 --- a/src/dashboard.cpp +++ b/src/dashboard.cpp @@ -23,6 +23,9 @@ #include +// VV: Define this to use time/energy/resources instead of speed/energy/efficiency +// #define ALTERNATIVE_SCORE + namespace allscale { namespace dashboard { node_state get_state() @@ -54,17 +57,25 @@ namespace allscale { namespace dashboard state.max_frequency = monitor_c->get_max_freq(0); std::size_t active_cores = scheduler::get().get_active_threads(); - + state.last_local_score = scheduler::get().get_last_objective_score(); state.productive_cycles_per_second = float(state.cur_frequency) * (1.f - state.idle_rate); // freq to Hz +#if defined(ALTERNATIVE_SCORE) + state.speed = monitor_c->get_avg_time_last_iterations(100); + state.efficiency = active_cores; +#else state.speed = 1.f - state.idle_rate; state.efficiency = state.speed * (float(state.cur_frequency * active_cores) / float(state.max_frequency * state.num_cores)); +#endif -#ifdef POWER_ESTIMATE +#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ) state.cur_power = monitor_c->get_current_power(); state.max_power = monitor_c->get_max_power(); - state.power = state.cur_power / state.max_power; +#else + state.max_power = 1.0; + state.cur_power = 1.0; #endif + state.power = state.cur_power / state.max_power; return state; } @@ -99,6 +110,7 @@ namespace allscale { namespace dashboard ar & speed; ar & efficiency; ar & power; + ar & last_local_score; } std::string node_state::to_json() const @@ -164,9 +176,15 @@ namespace allscale { namespace dashboard float system_state::score() const { +#if defined(ALTERNATIVE_SCORE) + return std::exp(speed * speed_exponent) * + std::exp(efficiency * efficiency_exponent ) * + std::exp(power * power_exponent); +#else return std::pow(speed, speed_exponent) * std::pow(efficiency, efficiency_exponent) * std::pow(1 - power, power_exponent); +#endif } template void node_state::serialize(hpx::serialization::input_archive& ar, unsigned); @@ -208,7 +226,7 @@ namespace allscale { namespace dashboard const char* host_env = std::getenv(ENVVAR_DASHBOARD_IP); const char* port_env = std::getenv(ENVVAR_DASHBOARD_PORT); - + std::string host; if (host_env) { @@ -298,11 +316,11 @@ namespace allscale { namespace dashboard buffers[0] = boost::asio::buffer(&m->msg_size, sizeof(std::uint64_t)); buffers[1] = boost::asio::buffer(m->json.data(), m->json.length()); -/* + /* std::cout << "Sending -----------------------------------\n"; std::cout << m->json << '\n'; std::cout << "Sending done ------------------------------\n"; -*/ + */ boost::asio::async_write(socket_, buffers, [f = std::move(f), m](boost::system::error_code ec, std::size_t /*length*/) { @@ -431,6 +449,7 @@ namespace allscale { namespace dashboard std::vector localities_; std::uint64_t time = 0; bool enabled_; + double use_gopt, use_lopt; }; dashboard_client& dashboard_client::get() @@ -490,13 +509,18 @@ namespace allscale { namespace dashboard total_efficiency += cur.efficiency; cur_power += cur.cur_power; } + max_power += cur.max_power; } state.speed = total_speed / client.localities_.size(); // state.speed = std::pow(total_speed, 1.f/client.localities_.size()); - +#if defined(ALTERNATIVE_SCORE) + // VV: This is the number of active threads + state.efficiency = total_efficiency; +#else state.efficiency = total_efficiency / client.localities_.size(); +#endif state.power = (max_power > 0) ? cur_power/max_power : 0; auto exponents = scheduler::get_optimizer_exponents(); diff --git a/src/optimizer.cpp b/src/optimizer.cpp index 19731e8..389aa5e 100644 --- a/src/optimizer.cpp +++ b/src/optimizer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,11 +26,20 @@ #define TRULY_RANDOM_DEBUG +#define DEBUG_NMD_INO 1 + +#ifdef DEBUG_NMD_INO +#define OUT_DEBUG(X) X +#else +#define OUT_DEBUG(X) \ + { \ + } +#endif + namespace allscale { optimizer_state get_optimizer_state() { - static float last_energy = 0.f; float load = 1.f - monitor::get().get_idle_rate(); float my_time = monitor::get().get_avg_time_last_iterations(HISTORY_ITERATIONS); @@ -37,18 +47,19 @@ namespace allscale my_time = -1.f; allscale::components::monitor *monitor_c = &allscale::monitor::get(); - float energy = 100.f; -#ifdef POWER_ESTIMATE - energy = monitor_c->get_current_power(); + float power_now = 0.001f; +#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ) + power_now = monitor_c->get_current_power() / monitor_c->get_max_power(); #endif - + // VV: Use power as if it were energy return { load, monitor::get().get_task_times(), my_time, - energy, + power_now, float(monitor_c->get_current_freq(0)), - scheduler::get().get_active_threads() + scheduler::get().get_active_threads(), + scheduler::get().get_total_threads() }; } // optimizer_state get_optimizer_state() @@ -89,11 +100,15 @@ namespace allscale scheduler::apply_new_mapping(new_mapping); } + void optimizer_update_max_threads(std::size_t max_threads) { + scheduler::update_max_threads(max_threads); + } } // namespace allscale HPX_PLAIN_DIRECT_ACTION(allscale::get_optimizer_state, allscale_get_optimizer_state_action); HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_policy, allscale_optimizer_update_policy_action); HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_policy_ino, allscale_optimizer_update_policy_action_ino); +HPX_PLAIN_DIRECT_ACTION(allscale::optimizer_update_max_threads, allscale_optimizer_update_max_threads); namespace allscale { @@ -127,6 +142,19 @@ tuning_objective get_default_objective() return tuning_objective::efficiency(); if (obj == "power") return tuning_objective::power(); + if ( obj == "local") { + double time_weight, energy_weight, resource_weight; + + auto &&local_scheduler = scheduler::get(); + + local_scheduler.get_local_optimizer_weights(&time_weight, + &energy_weight, + &resource_weight); + // VV: If the local-optimizer is used too then copy its objectives + return tuning_objective(time_weight, + resource_weight, + energy_weight); + } float speed = 0.0f; float efficiency = 0.0f; @@ -170,14 +198,28 @@ float estimate_power(float frequency) global_optimizer::global_optimizer() : u_balance_every(10), u_steps_till_rebalance(u_balance_every), - active_nodes_(allscale::get_num_localities(), true), tuner_(new simple_coordinate_descent(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)})), + active_nodes_(allscale::get_num_localities(), true), objective_(get_default_objective()), active_(true), localities_(hpx::find_all_localities()), - f_resource_max(-1.0f), f_resource_leeway(-1.0f) + f_resource_max(-1.0f), f_resource_leeway(-1.0f), + nmd(0.005), + nmd_initialized(0), + nodes_min(1), nodes_max(localities_.size()), threads_min(0), threads_max(0), + last_optimization_score(1.0) { char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY"); - - if (c_policy && strncasecmp(c_policy, "ino", 3) == 0 ) + char *const c_tuner = std::getenv("ALLSCALE_TUNER"); + + std::string input_objective_str = + hpx::get_config_entry("allscale.objective", ""); + + if ( input_objective_str == "allscale" ) + use_lopt = true; + else + use_lopt = false; + previous_num_nodes = localities_.size(); + + if (c_policy && strcasecmp(c_policy, "ino") == 0 ) { char *const c_resource_max = std::getenv("ALLSCALE_RESOURCE_MAX"); char *const c_resource_leeway = std::getenv("ALLSCALE_RESOURCE_LEEWAY"); @@ -195,20 +237,64 @@ global_optimizer::global_optimizer() f_resource_max = 0.75f; else f_resource_max = atof(c_resource_max); + + nodes_min = f_resource_leeway * localities_.size(); + } + nodes_max = localities_.size(); + + if ( nodes_min < 1 ) + nodes_min = 1; + + if ( c_policy && strcasecmp(c_policy, "ino")) o_ino = allscale::components::internode_optimizer_t(localities_.size(), (double) f_resource_max, (double) f_resource_leeway, INO_DEFAULT_FORGET_AFTER); + + if ( c_policy && strcasecmp(c_policy, "ino_nmd")) { + char *const c_threads_min = std::getenv("ALLSCALE_GINO_THREADS_MIN"); + char *const c_threads_max = std::getenv("ALLSCALE_GINO_THREADS_MAX"); + + if ( c_threads_min ) + threads_min = atoi(c_threads_min); + + if ( c_threads_max ) + threads_max = atoi(c_threads_max); } -// else if ( strncasecmp(c_policy, "truly_random", 12) == 0 ) { -// char *const c_balance_every = std::getenv("ALLSCALE_TRULY_RANDOM_BALANCE_EVERY"); -// -// if ( c_balance_every ) { -// u_balance_every = (std::size_t) atoi(c_balance_every); -// u_steps_till_rebalance = u_balance_every; -// } -// } + + // VV: Guestimate that max iter time is 500 ms (will be refined over time) + objectives_scale[0] = 0.5; + objectives_scale[1] = 1.0; + objectives_scale[2] = 1.0; + + if (c_policy && strcasecmp(c_policy, "neldermead")) { + std::cout << "Choosing NelderMead Optimizer for global optimization" << std::endl; + tuner_ = std::make_unique(nodes_min, nodes_max); + } + else { + std::cout << "Choosing Coordinate Descent Optimizer for global optimization" << std::endl; + tuner_ = std::make_unique(tuner_configuration{active_nodes_, allscale::monitor::get().get_current_freq(0)}); + } +} + +double global_optimizer::get_optimization_score() +{ + return last_optimization_score; +} + +void global_optimizer::signal_objective_changed() +{ + const double new_weights[3] = { + objective_.speed_exponent, + objective_.power_exponent, + objective_.efficiency_exponent + }; + + nmd.set_weights(new_weights); + + if ( nmd_initialized ) + nmd_initialized = 0; } void global_optimizer::tune(std::vector const &state) @@ -234,7 +320,7 @@ void global_optimizer::tune(std::vector const &state) total_efficiency += state[i].load_ * (float(state[i].active_frequency_ * state[i].cores_per_node_) / float(max_frequency * state[i].cores_per_node_));; used_power += state[i].energy_; } -#ifdef POWER_ESTIMATE +#if defined(POWER_ESTIMATE) || defined(ALLSCALE_HAVE_CPUFREQ) max_power += monitor_c->get_max_power(); #endif } @@ -364,7 +450,7 @@ hpx::future global_optimizer::decide_random_mapping(const std::vector global_optimizer::decide_random_mapping(const std::vector global_optimizer::decide_random_mapping(const std::vector global_optimizer::balance_ino_nmd(const std::vector &old_mapping) +{ + u_steps_till_rebalance = u_balance_every; + return hpx::lcos::broadcast(localities_) + .then( + [this, old_mapping](hpx::future > future_state) { + std::lock_guard l(mtx_); + std::size_t num_active_nodes = std::count(active_nodes_.begin(), active_nodes_.end(), true); + + auto state = future_state.get(); + float avg_time = 0; + float avg_energy = 0; + float avg_threads = 0; + int from_node = 0; + + std::size_t num_avg_time = 0ul; + + for (const auto &s:state) { + // VV: Only keep track of nodes that were selected by last step + if ( from_node++ == previous_num_nodes ) + break; + + if ( s.avg_time_ > 0.0) { + avg_time += s.avg_time_; + num_avg_time ++; + } + + avg_energy += s.energy_; + avg_threads += s.active_cores_per_node_ / (float) s.cores_per_node_; + + ++from_node; + } + + if ( num_avg_time ) + avg_time /= num_avg_time; + else + avg_time = 0.0; + + avg_energy /= num_active_nodes; + avg_threads /= num_active_nodes; + + // VV: First record current state + double measurements[3] = {avg_time, + avg_energy, + avg_threads}; + + if ( objectives_scale[0] < avg_time ) { + objectives_scale[0] = avg_time * 2.0; + nmd.set_scale(objectives_scale); + } + + if ( nmd_initialized == 0 ) { + double weights[] = {(double) objective_.speed_exponent, + (double) objective_.efficiency_exponent, + (double) objective_.power_exponent}; + const double constraint_min[] = {(double) nodes_min, + (double) threads_min}; + const double constraint_max[] = {(double) nodes_max, + (double) threads_max}; + nmd.set_scale(objectives_scale); + + nmd.initialize_simplex(weights, + nullptr, + constraint_min, + constraint_max); + + nmd_initialized = 1; + } + + auto action = nmd.step(measurements, + previous_num_nodes, + avg_threads * previous_num_nodes); + + last_optimization_score = nmd.evaluate_score(measurements, nullptr); + + // VV: Todo do something with the action + // assume that .threads = nodes and .freq_idx = threads per node + int new_num_nodes = action.threads; + int new_threads_per_node = action.freq_idx; + + if ( new_num_nodes != previous_num_nodes ) { + // VV: Need to redistribute tasks to nodes. + // Try to move as few as possible tasks + /* VV: Balancing algorithm: + new_avg_tasks = ceil(total_tasks / new_num_nodes) + node_to_tasks{} = find out which tasks each node is computing() + + if ( new_num_nodes < previous_nodes ) { + // VV: Evenly distribute all now orphaned tasks to remaining nodes + orphaned_tasks = those which were running on the now unused nodes + for ( node:new_used_nodes ) { + old_tasks = size(node_to_tasks[node]) + added_to_node = 0; + while (remaining_orphaned + && added_to_node < new_avg_tasks-old_tasks) { + orphan = orphaned.pop() + node.tasks.push_back(orphan) + added_to_node ++; + } + } + } else if ( new_num_nodes > previous_node ) { + num_need_to_move = new_avg_tasks; + node_to_move = previous_nodes; + + // VV: Redistribute last tasks from overflowed nodes to new ones + while ( num_need_to_move > 0 && node_to_move < new_num_nodes ) { + for ( node:new_used_nodes ) { + if ( num_need_to_move == 0 ) { + if ( node_to_move < new_num_nodes) { + node_to_move ++; + num_need_to_move = new_avg_tasks; + } else { + break; + } + } + + task = node.tasks[-1] + node_to_tasks[node_to_move].tasks.push_back(task) + num_need_to_move -- + } + } + } + */ + // VV: Some of the nodes might be dead, convert the virtual name + // to the physical name + auto virtual_to_physical = std::vector(); + + std::size_t cur_node = 0ul; + + for (const auto &physical:active_nodes_) { + if ( physical ) { + OUT_DEBUG( + std::cout << "[Ino_NMD] Node " << cur_node << " is was used last time" << std::endl; + ) + } else { + OUT_DEBUG( + std::cout << "[Ino_NMD] Node " << cur_node << " was not used last time" << std::endl; + ) + } + + virtual_to_physical.push_back(cur_node); + cur_node ++; + } + num_active_nodes = active_nodes_.size(); + + if ( new_num_nodes > num_active_nodes ) + new_num_nodes = num_active_nodes; + + if ( previous_num_nodes > num_active_nodes ) + previous_num_nodes = num_active_nodes; + + auto new_avg_tasks = (std::size_t) std::ceil(old_mapping.size()/ + (float)new_num_nodes); + auto new_mapping = std::vector(old_mapping.size(), 0ul); + auto node_to_tasks = std::map >(); + // VV: node_to_tasks maps node id to list of tasks that it's running + std::size_t task_id = 0; + + + for (auto i=0ul; i())); + + for ( const auto &node_id:old_mapping ) + node_to_tasks[node_id].push_back(task_id++); + + OUT_DEBUG( + std::cout << "[Ino_NMD] Rebalancing (original):" << std::endl; + + for ( const auto &node: node_to_tasks ) { + std::cout << "node " << node.first << ": "; + for ( const auto &task:node.second) + std::cout << " " << task; + std::cout << std::endl; + } + ) + + // VV: Something else is setting the scheduling policy too + // try to redistribute tasks to all @previous_num_nodes + OUT_DEBUG( + std::cout << "[GLOBAL OPTIMIZER] Re-balancing previous nodes" << std::endl; + ) + + auto prev_avg_tasks = + (std::size_t) std::ceil(old_mapping.size() / + (float)previous_num_nodes); + auto node_fewer_tasks = 1ul; + + for (auto node_id = 0ul; node_id < num_active_nodes; ++node_id) + { + auto &node = node_to_tasks[node_id]; + while (node.size() > prev_avg_tasks) + { + while (node_to_tasks[node_fewer_tasks].size() >= prev_avg_tasks) + if (++node_fewer_tasks == previous_num_nodes) + break; + + if (node_fewer_tasks == previous_num_nodes) + break; + + auto task = node.back(); + node.pop_back(); + node_to_tasks[node_fewer_tasks].push_back(task); + } + } + + OUT_DEBUG( + std::cout << "[GLOBAL OPTIMIZER] Rebalanced (still original):" << std::endl; + + for ( const auto &node: node_to_tasks ) { + std::cout << "node " << node.first << ": "; + for ( const auto &task:node.second) + std::cout << " " << task; + std::cout << std::endl; + } + + + std::cout << "[GLOBAL OPTIMIZER] Changing nodes from " + << previous_num_nodes + << " to " << new_num_nodes << std::endl; + ) + + if (new_num_nodes < previous_num_nodes) + { + OUT_DEBUG( + std::cout << "[GLOBAL OPTIMIZER] Decreasing nodes" << std::endl; + ) + auto lost_node = new_num_nodes; + + while (lost_node < previous_num_nodes && node_to_tasks[lost_node].size()) + { + for (auto node_id = 0ul; node_id < new_num_nodes; ++node_id) + { + auto &node = node_to_tasks[node_id]; + auto old_tasks = node.size(); + for (auto new_tasks = old_tasks; + lost_node < previous_num_nodes && new_tasks < new_avg_tasks; + new_tasks++) + { + // VV: Move next orphaned task to @node + while (node_to_tasks[lost_node].size() == 0) + { + if (++lost_node == previous_num_nodes) + break; + } + + if (lost_node == previous_num_nodes) + break; + + std::size_t task = node_to_tasks[lost_node].back(); + node_to_tasks[lost_node].pop_back(); + node.push_back(task); + } + } + } + } + else if (new_num_nodes > previous_num_nodes) + { + OUT_DEBUG( + std::cout << "[GLOBAL OPTIMIZER] Increasing nodes" << std::endl; + ) + auto new_node = previous_num_nodes - 1; + for (auto node_id = 0ul; node_id < previous_num_nodes; ++node_id) + { + auto &node = node_to_tasks[node_id]; + while (node.size() > new_avg_tasks) + { + while (node_to_tasks[new_node].size() >= new_avg_tasks) + if (++new_node == new_num_nodes) + break; + + if (new_node == new_num_nodes) + break; + + auto task = node.back(); + node.pop_back(); + node_to_tasks[new_node].push_back(task); + } + } + } + else + { + OUT_DEBUG( + std::cout << "[GLOBAL OPTIMIZER] Did not modify mapping" << std::endl; + ) + } + + if (previous_num_nodes != new_num_nodes ) { + OUT_DEBUG( + std::cout << "[GLOBAL OPTIMIZER] Rebalancing (NEW):" << std::endl; + + for ( const auto &node: node_to_tasks ) { + std::cout << "node " << node.first << ": "; + for ( const auto &task:node.second) + std::cout << " " << task; + std::cout << std::endl; + } + ) + + for (auto i = 0ul; i< new_mapping.size(); ++i) + new_mapping[i] = virtual_to_physical[new_mapping[i]]; + + previous_num_nodes = new_num_nodes; + hpx::lcos::broadcast_apply(localities_, new_mapping); + + for (auto i=0u; i(localities_, new_threads_per_node); + } + }); +} + hpx::future global_optimizer::balance_ino(const std::vector &old_mapping) { /*VV: Compute the new ino_knobs (i.e. number of Nodes), then assign tasks to @@ -475,6 +887,7 @@ hpx::future global_optimizer::balance_ino(const std::vector & #ifdef INO_DEBUG_DECIDE_SCHEDULE std::cerr << "Ino picked a schedule" << std::endl; #endif + for (auto node_wis : ino_schedule) for (auto wi : node_wis.second.v_work_items) new_mapping[wi] = node_wis.first; diff --git a/src/scheduler.cpp b/src/scheduler.cpp index 95367c6..05a7479 100644 --- a/src/scheduler.cpp +++ b/src/scheduler.cpp @@ -92,18 +92,9 @@ namespace allscale obj = objective_str.substr(0, idx); leeway = std::stod( objective_str.substr(idx + 1) ); } - - if (obj == "time") - { - enable_elasticity = true; - break; - } - else if (obj == "resource") - { - enable_elasticity = true; - break; - } } + + enable_elasticity = true; } rp.set_default_pool_name("allscale-numa-0"); @@ -175,6 +166,7 @@ namespace allscale * ALLSCALE_RESOURCE_LEEWAY = (0.0, 1.0) // extra percentage allowed to explore */ ino, + ino_nmd, random, truly_random }; @@ -194,8 +186,12 @@ namespace allscale return "tuned"; case ino: return "ino"; + case ino_nmd: + return "ino_nmd"; case random: return "random"; + case truly_random: + return "truly_random"; default: return "unknown"; } @@ -232,6 +228,19 @@ namespace allscale tree_scheduling_policy::create_uniform(allscale::get_num_localities()) }; } + if (policy == "ino_nmd" ) { + return { + replacable_policy::ino_nmd, + tree_scheduling_policy::create_uniform(allscale::get_num_localities()) + }; + } + if (policy == "truly_random") + { + return { + replacable_policy::truly_random, + tree_scheduling_policy::create_uniform(allscale::get_num_localities()) + }; + } if (policy == "random") { return { @@ -286,6 +295,8 @@ namespace allscale , right_id_(std::move(other.right_id_)) , is_root_(other.is_root_) , optimizer_(std::move(other.optimizer_)) + , use_gopt(other.use_gopt) + , use_lopt(other.use_lopt) { HPX_ASSERT(false); } @@ -298,6 +309,20 @@ namespace allscale , parent_(here_.getParent()) , is_root_(here_ == root_) { + char *const c_policy = std::getenv("ALLSCALE_SCHEDULING_POLICY"); + std::string input_objective_str = hpx::get_config_entry("allscale.objective", ""); + + if (c_policy && strcasecmp(c_policy, "ino") == 0 ) + use_gopt = true; + else + use_gopt = false; + + if ( input_objective_str == "allscale" ) + use_lopt = true; + else + use_lopt = false; + + if (parent_.getRank() != scheduler::rank()) { parent_id_ = hpx::naming::get_id_from_locality_id( @@ -325,7 +350,7 @@ namespace allscale if (is_root_) run(); } - + std::string policy() { return policy_.policy(); @@ -334,8 +359,13 @@ namespace allscale void apply_new_mapping(const std::vector &new_mapping) { std::lock_guard l(mtx_); - policy_.policy_ = tree_scheduling_policy::from_mapping(*policy_.policy_, - new_mapping); + policy_.policy_ = + tree_scheduling_policy::from_mapping(*policy_.policy_, new_mapping); + } + + void update_max_threads(std::size_t max_threads) { + auto &&local_scheduler = scheduler::get(); + local_scheduler.update_max_threads(max_threads); } void toggle_node(std::size_t locality_id) @@ -357,22 +387,74 @@ namespace allscale } } + double get_local_objective() { + auto &&local_scheduler = scheduler::get(); + return local_scheduler.get_last_objective_score(); + } + + double get_last_objective_score() + { + auto &&local_scheduler = scheduler::get(); + return local_scheduler.get_last_objective_score(); + } + void set_speed_exponent(float exp) { std::lock_guard l(optimizer_.mtx_); optimizer_.objective_.speed_exponent = exp; + optimizer_.signal_objective_changed(); + + double time_weight, energy_weight, resource_weight; + + auto &&local_scheduler = scheduler::get(); + + local_scheduler.get_local_optimizer_weights(&time_weight, + &energy_weight, + &resource_weight); + time_weight = (double) exp; + + local_scheduler.set_local_optimizer_weights(time_weight, + energy_weight, + resource_weight); } void set_efficiency_exponent(float exp) { std::lock_guard l(optimizer_.mtx_); optimizer_.objective_.efficiency_exponent = exp; + optimizer_.signal_objective_changed(); + + double time_weight, energy_weight, resource_weight; + + auto &&local_scheduler = scheduler::get(); + + local_scheduler.get_local_optimizer_weights(&time_weight, + &energy_weight, + &resource_weight); + resource_weight = (double) exp; + + local_scheduler.set_local_optimizer_weights(time_weight, + energy_weight, + resource_weight); } void set_power_exponent(float exp) { std::lock_guard l(optimizer_.mtx_); optimizer_.objective_.power_exponent = exp; + optimizer_.signal_objective_changed(); + double time_weight, energy_weight, resource_weight; + + auto &&local_scheduler = scheduler::get(); + + local_scheduler.get_local_optimizer_weights(&time_weight, + &energy_weight, + &resource_weight); + energy_weight = (double) exp; + + local_scheduler.set_local_optimizer_weights(time_weight, + energy_weight, + resource_weight); } hpx::util::tuple get_optimizer_exponents() @@ -385,6 +467,7 @@ namespace allscale ); } + bool use_gopt, use_lopt; void set_policy(std::string policy) { @@ -448,6 +531,16 @@ namespace allscale tree_scheduling_policy const& old = static_cast(*policy_.policy_); optimizer_.balance_ino(old.task_distribution_mapping()); } + + if ( policy_.value_ == replacable_policy::ino_nmd) { + tree_scheduling_policy const& old = static_cast(*policy_.policy_); + optimizer_.balance_ino_nmd(old.task_distribution_mapping()); + } + + if (policy_.value_ == replacable_policy::truly_random) { + tree_scheduling_policy const& old = static_cast(*policy_.policy_); + optimizer_.decide_random_mapping(old.task_distribution_mapping()); + } return true; } @@ -462,7 +555,7 @@ namespace allscale void schedule(work_item work) { - if (is_root_ && work.id().is_root() && work.id().id % 20 == 0) + if (is_root_ && work.id().is_root() && work.id().id % 5 == 0) { balance(); } @@ -667,6 +760,23 @@ namespace allscale ); } + double get_last_objective_score() + { + std::vector scores; + + runtime::HierarchicalOverlayNetwork::forAllLocal( + [&](scheduler_service& sched) + { + scores.push_back(sched.get_last_objective_score()); + } + ); + + std::cout << "GET_LAST_OBJETIVE_SCORE (SCHED): got " << scores.size() << " values" << std::endl; + for (const auto &score: scores ) { + std::cout << score << std::endl; + } + } + void set_efficiency_exponent_broadcast(float exp) { runtime::HierarchicalOverlayNetwork::forAllLocal( @@ -747,6 +857,16 @@ namespace allscale monitor::get().set_cur_freq(freq); } + void scheduler::update_max_threads(std::size_t max_threads) + { + runtime::HierarchicalOverlayNetwork::forAllLocal( + [&](scheduler_service& sched) + { + sched.update_max_threads(max_threads); + } + ); + } + void scheduler::apply_new_mapping(const std::vector &new_mapping) { runtime::HierarchicalOverlayNetwork::forAllLocal( diff --git a/src/tuner.cpp b/src/tuner.cpp index 546a2be..2203687 100644 --- a/src/tuner.cpp +++ b/src/tuner.cpp @@ -4,6 +4,8 @@ #include #include #include +#include + namespace allscale { std::ostream& operator<<(std::ostream& os, tuner_configuration const& cfg) @@ -204,4 +206,104 @@ namespace allscale { // print a status message std::cerr << "New search direction: " << (dim == num_nodes ? "#nodes" : "frequency") << " " << (dir == up ? "up" : "down") << "\n"; } + + nmd_optimizer::nmd_optimizer(std::size_t nodes_min, + std::size_t nodes_max) + : nmd(2, 3, 0.01, 2000, 50ul) + , converged(false) + { + constraint_min[0] = nodes_min; + constraint_max[0] = nodes_max; + + avail_freqs = monitor::get().get_available_freqs(0); + std::sort(avail_freqs.begin(), avail_freqs.end()); + + if ( avail_freqs.size() ) { + constraint_min[1] = 0; + constraint_max[1] = avail_freqs.size() - 1; + } else { + constraint_min[1] = 0; + constraint_max[1] = 0; + } + + previous_weights[0] = 0; + previous_weights[1] = 0; + previous_weights[2] = 0; + } + + tuner_configuration nmd_optimizer::next(tuner_configuration const& current_cfg, tuner_state const& current_state, tuning_objective obj) + { + tuner_configuration res; + auto action = std::vector(); + + const double weights[] = { + obj.speed_exponent, obj.efficiency_exponent, obj.power_exponent + }; + + double diff = 0.0; + + for (auto i=0ul; i<3; ++i) + diff += abs(previous_weights[i] - weights[i]); + + if ( diff > 0.01 ) { + // VV: Enforce exploration + initialized = false; + this->converged = false; + } + + for (auto i=0ul; i<3; ++i) + previous_weights[i] = weights[i]; + + if ( initialized == false ){ + nmd.initialize(constraint_min, + constraint_max, + nullptr, + weights, + &nmd.score_speed_efficiency_power); + initialized = true; + } + + if ( this->converged == false ) { + double measurements[3] = {current_state.speed, current_state.efficiency, current_state.power}; + + std::size_t num_active_nodes = std::count(current_cfg.node_mask.begin(), + current_cfg.node_mask.end(), + true); + std::size_t freq_idx; + auto e = std::find(avail_freqs.begin(), avail_freqs.end(), current_cfg.frequency); + + if ( e == avail_freqs.end() ) + freq_idx = 0; + else + freq_idx = e - avail_freqs.begin(); + + const std::size_t observed[] = {num_active_nodes, freq_idx}; + auto ret = nmd.get_next(measurements, observed); + action.assign(ret.first.begin(), ret.first.end()); + auto converged = ret.second; + + if (converged) { + best.assign(action.begin(), action.end()); + this->converged = true; + } + } else { + action.assign(best.begin(), best.end()); + } + + res.node_mask.assign(current_cfg.node_mask.begin(), + current_cfg.node_mask.end()); + + for (auto i=0ul; i