diff --git a/.gitignore b/.gitignore index 5387b99..626665b 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,4 @@ rsconnect/ /attic/ .Rprofile kaggle/ +deepcave \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 4ab70c0..b3e3839 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,8 @@ Imports: mlr3tuningspaces, paradox (>= 1.0.1), R6, - utils + utils, + jsonlite Suggests: catboost, e1071, @@ -62,4 +63,5 @@ Collate: 'LearnerClassifAutoXgboost.R' 'LearnerRegrAuto.R' 'helper.R' + 'save_deepcave_run.R' 'zzz.R' diff --git a/NAMESPACE b/NAMESPACE index 6fc0f11..14bb580 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(LearnerClassifAuto) export(LearnerClassifAutoSVM) export(LearnerClassifAutoXgboost) export(LearnerRegrAuto) +export(save_deepcave_run) import(R6) import(checkmate) import(data.table) diff --git a/R/save_deepcave_run.R b/R/save_deepcave_run.R new file mode 100644 index 0000000..c71745e --- /dev/null +++ b/R/save_deepcave_run.R @@ -0,0 +1,317 @@ +#' @title Save Tuning History as a DeepCAVE Run +#' +#' @description +#' Exports information stored in a `TuningInstance` in a format recognized by [DeepCAVE](https://automl.github.io/DeepCAVE/main/index.html) as a run. Each run is stored as a folder containing five files `configs.json`, `configspace.json`, `history.jsonl`, `meta.json`, and `origins.json`. +#' +#' @param instance ([TuningInstanceAsyncSingleCrit]) +#' Tuning instance to save. +#' +#' @param path (`character(1)`) +#' Path to save the run. Defaults to `"logs/mlr3automl`. +#' +#' @param prefix (`character(1)`) +#' Prefix for the name of a new subfolder under `path` for storing the current run. +#' +#' @param overwrite (`character(1)`) +#' If `FALSE` (default), creates a new subfolder to save the current run. If `TRUE`, all existing runs will be deleted. +#' +#' @export +#' @examples +#' \dontrun{ +#' rush_plan(n_workers = 2) +#' task = tsk("penguins") +#' +#' learner1 = lrn("classif.auto", +#' learner_ids = c("svm", "ranger"), +#' small_data_size = 1, +#' resampling = rsmp("holdout"), +#' measure = msr("classif.ce"), +#' terminator = trm("evals", n_evals = 6) +#' ) +#' learner1$train(task) +#' # save to `logs/mlr3automl/run_1` +#' save_deepcave_run(learner1$instance) +#' +#' # save to `logs/mlr3automl/run` +#' # if this folder already exists, it will be overwritten +#' save_deepcave_run(learner1$instance, overwrite = TRUE) +#' +#' learner2 = lrn("classif.auto", +#' learner_ids = c("catboost", "xgboost"), +#' small_data_size = 1, +#' resampling = rsmp("holdout"), +#' measure = msr("classif.ce"), +#' terminator = trm("evals", n_evals = 6) +#' ) +#' learner2$train(task) +#' # save to `logs/mlr3automl/run_2` +#' save_deepcave_run(learner2$instance) +#' } +save_deepcave_run = function(instance, path = "logs/mlr3automl", prefix = "run", overwrite = FALSE) { + # don't save untuned instance + if (is.null(instance$result_learner_param_vals)) { + warning("No run is saved, because no tuning has been completed.") + return() + } + + # create a subfolder for saving the current run + # original Python implementation see `Recorder._set_path()` + # (https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py) + if (!overwrite) { + new_idx = 0 + for (fn in list.files(path)) { + if (!startsWith(fn, prefix)) next + + splitted = strsplit(fn, "_")[[1]] + if (length(splitted) == 1) next # no run index attached + + idx = suppressWarnings(last(splitted)) + if (is.na(idx)) next # idx cannot be coerced to a number + + idx_int = as.integer(idx) + if (idx_int > new_idx) { + new_idx = idx_int + } + } + + new_idx = new_idx + 1 + run_path = file.path(path, paste0(prefix, "_", new_idx)) + + dir.create(run_path, recursive = TRUE) + } else { + run_path = file.path(path, prefix) + if (file.exists(run_path)) { + lapply(list.files(run_path, full.names = TRUE), file.remove) + } else{ + dir.create(run_path, recursive = TRUE) + } + } + + + # `configspace.json` + jsonlite::write_json( + get_configspace(instance), + file.path(run_path, "configspace.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + # `configs.json` + jsonlite::write_json( + get_configs(instance), + file.path(run_path, "configs.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + # `meta.json` + jsonlite::write_json( + get_meta(instance), + file.path(run_path, "meta.json"), + auto_unbox = TRUE, pretty = TRUE, null = "null" + ) + + # `history.jsonl` + con = file(file.path(run_path, "history.jsonl"), open = "w") + jsonlite::stream_out( + get_history(instance), + con, + # objectives must be a list, so do not auto unbox if a list has only one entry + auto_unbox = FALSE, + pretty = TRUE, null = "list", na = "null", + dataframe = "values", + verbose = FALSE + ) + close(con) + + # `origins.json` (a list of `null`s) + origins = rep(list(NULL), nrow(instance$archive$data)) + names(origins) = seq(nrow(instance$archive$data)) - 1 + jsonlite::write_json( + origins, + file.path(run_path, "origins.json"), + pretty = TRUE, null = "null" + ) +} + + +# Prepare the list for converting to `configs.json` +get_configs = function(instance){ + param_ids = instance$search_space$data$id + + # skip branch.selection if there is only one level + id = NULL # resolve global variable note in R CDM check + nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE] + if (nbranches == 1) { + param_ids = setdiff(param_ids, "branch.selection") + } + + config_table = instance$archive$data[, param_ids, with = FALSE] + # param values in deepcave are on the original scale, not the log scale + logscale_params = param_ids[instance$search_space$is_logscale[param_ids]] + config_table[, (logscale_params) := lapply(.SD, exp), .SDcols = logscale_params] + + configs_list = map(seq_len(nrow(config_table)), function(i) { + discard(as.list(config_table[i, ]), is.na) + }) + names(configs_list) = seq_along(configs_list) - 1 + + return(configs_list) +} + + +# Prepare the list for converting to `configspace.json` +get_configspace = function(instance) { + param_ids = instance$search_space$data$id + + hyperparameters_list = map(param_ids, function(param_id) { + id = NULL # resolve global variable note in R CDM check + row = instance$search_space$data[id == param_id, ] + + # skip branch.selection if there is only one branch + if (param_id == "branch.selection" && row[["nlevels"]] == 1) return() + + type = switch(row[["class"]], + ParamFct = "categorical", + ParamLgl = "categorical", + ParamDbl = "uniform_float", + ParamInt = "uniform_int") + + # categorical params + if (type == "categorical") { + choices = unlist(row[["levels"]]) + return(list( + name = param_id, + type = type, + choices = choices, + # FIXME: `default` is wrong + default = choices[[1]], + probabilities = NULL + )) + } + + # int / float params + is_logscale = instance$search_space$is_logscale[[param_id]] + lower = row[["lower"]] + upper = row[["upper"]] + # FIXME: default is wrong + default = lower + if (is_logscale) { + lower = exp(lower) + upper = exp(upper) + default = exp(default) + } + return(list( + name = param_id, + type = type, + log = is_logscale, + lower = lower, + upper = upper, + default = default, + q = NULL + )) + }) + # skipping branch.selection results in null entries => discard them + hyperparameters_list = discard(hyperparameters_list, is.null) + + + conditions_list = map(setdiff(param_ids, "branch.selection"), function(param_id) { + id = NULL # resolve global variable note in R CDM check + dependency = instance$search_space$deps[id == param_id, ] + # `svm.degree` and `svm.gamma` depends on `svm.kernel` as well as `branch.selection`. + # DeepCAVE does not allow one parameter to be conditioned on multiple others. + # So remove their dependency on `branch.selection`. + if (nrow(dependency) > 1) { + on = NULL # resolve global variable note in R CDM check + dependency = dependency[on != "branch.selection", ] + } + child = param_id + parent = dependency[["on"]] + + # remove dependency on branch.selection if there is only one branch + nbranches = instance$search_space$data[id == "branch.selection", "nlevels", with = FALSE] + if (parent == "branch.selection" && nbranches == 1) return() + + # `cond` below is a list of `Condition`s. + # Currently, there are only 'CondEqual' and 'CondAnyOf', which should not be used simultaneously. + # So this list should always contain only one entry. + cond = dependency[["cond"]][[1]] + if (class(cond)[[1]] == "CondEqual") { + return(list(child = child, parent = parent, type = "EQ", value = cond$rhs)) + } + return(list(child = child, parent = parent, type = "IN", values = cond$rhs)) + }) + # skipping branch.selection results in null entries => discard them + conditions_list = discard(conditions_list, is.null) + + return(list( + hyperparameters = hyperparameters_list, + conditions = conditions_list, + forbiddens = list() + )) +} + +# Prepare the data.table for converting to `history.jsonl` +get_history = function(instance) { + costs = instance$objective$codomain$data$id + + selected_cols = c(costs, "timestamp_xs", "timestamp_ys", "state") + timestamp_xs = timestamp_ys = state = NULL # resolve global variable note in R CDM check + history_table = instance$archive$data[, selected_cols, with = FALSE][, list( + config_id = seq_len(nrow(instance$archive$data)) - 1, + budget = 0, + seed = -1, + # combine costs into a list column + costs = lapply(transpose(.SD), c), + # handle start and end time (time elapsed since first timestamp) + # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/recorder.py + # start and end time here might having different meanings than the original implementation + start_time = as.numeric(timestamp_xs - timestamp_xs[1]), + end_time = as.numeric(timestamp_ys - timestamp_ys[1]), + # state is either "finished" <=> SUCESS = 1 or ABORTED = 5 + # see https://github.com/automl/DeepCAVE/blob/main/deepcave/runs/status.py + state = ifelse(state == "finished", 1, 5), + additionals = list() + ), .SDcols = costs] + + return(history_table) +} + + +# Prepare the list for converting to 'meta.json' +get_meta = function(instance){ + costs = instance$objective$codomain$data$id + + objectives_list = map(costs, function(cost) { + measure = msr(cost) + + lower = measure$range[[1]] + if (is.finite(lower)) { + lock_lower = TRUE + } else { + lower = min(instance$archive$data[, cost, with = FALSE]) + lock_lower = FALSE + } + + upper = measure$range[[2]] + if (is.finite(upper)) { + lock_upper = TRUE + } else { + upper = max(instance$archive$data[, cost, with = FALSE]) + lock_upper = FALSE + } + + optimize = if (measure$minimize) { + "lower" + } else { + "upper" + } + + return(list(name = cost, lower = lower, upper = upper, + lock_lower = lock_lower, lock_upper = lock_upper, optimize = optimize)) + }) + + return(list( + objectives = objectives_list, + budgets = list(0), + seeds = list(-1) + )) +} diff --git a/man/save_deepcave_run.Rd b/man/save_deepcave_run.Rd new file mode 100644 index 0000000..61ca04d --- /dev/null +++ b/man/save_deepcave_run.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/save_deepcave_run.R +\name{save_deepcave_run} +\alias{save_deepcave_run} +\title{Save Tuning History as a DeepCAVE Run} +\usage{ +save_deepcave_run( + instance, + path = "logs/mlr3automl", + prefix = "run", + overwrite = FALSE +) +} +\arguments{ +\item{instance}{(\link{TuningInstanceAsyncSingleCrit}) +Tuning instance to save.} + +\item{path}{(\code{character(1)}) +Path to save the run. Defaults to \verb{"logs/mlr3automl}.} + +\item{prefix}{(\code{character(1)}) +Prefix for the name of a new subfolder under \code{path} for storing the current run.} + +\item{overwrite}{(\code{character(1)}) +If \code{FALSE} (default), creates a new subfolder to save the current run. If \code{TRUE}, all existing runs will be deleted.} +} +\description{ +Exports information stored in a \code{TuningInstance} in a format recognized by \href{https://automl.github.io/DeepCAVE/main/index.html}{DeepCAVE} as a run. Each run is stored as a folder containing five files \code{configs.json}, \code{configspace.json}, \code{history.jsonl}, \code{meta.json}, and \code{origins.json}. +} +\examples{ +\dontrun{ +rush_plan(n_workers = 2) +task = tsk("penguins") + +learner1 = lrn("classif.auto", + learner_ids = c("svm", "ranger"), + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) +) +learner1$train(task) +# save to `logs/mlr3automl/run_1` +save_deepcave_run(learner1$instance) + +# save to `logs/mlr3automl/run` +# if this folder already exists, it will be overwritten +save_deepcave_run(learner1$instance, overwrite = TRUE) + +learner2 = lrn("classif.auto", + learner_ids = c("catboost", "xgboost"), + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) +) +learner2$train(task) +# save to `logs/mlr3automl/run_2` +save_deepcave_run(learner2$instance) +} +} diff --git a/tests/testthat/test_save_deepcave_run.R b/tests/testthat/test_save_deepcave_run.R new file mode 100644 index 0000000..4a3f85b --- /dev/null +++ b/tests/testthat/test_save_deepcave_run.R @@ -0,0 +1,64 @@ +test_that("run is saved without overwriting", { + dir = tempdir() + + previous_run = file.path(dir, "test-1-run_1") + dir.create(previous_run, showWarnings = FALSE) + + rush_plan(n_workers = 2) + skip_if_not_installed("e1071") + + task = tsk("penguins") + learner = lrn("classif.auto", + learner_ids = "svm", + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) + ) + learner$train(task) + + expected_path = file.path(dir, "test-1-run_2") + if (file.exists(expected_path)) { + lapply(list.files(expected_path, full.names = TRUE), file.remove) + file.remove(expected_path) + } + + save_deepcave_run(learner$instance, path = dir, prefix = "test-1-run", overwrite = FALSE) + + expect_file_exists(file.path(expected_path, "configspace.json")) + expect_file_exists(file.path(expected_path, "configs.json")) + expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "meta.json")) + expect_file_exists(file.path(expected_path, "origins.json")) +}) + +test_that("overwriting works", { + dir = tempdir() + expected_path = file.path(dir, "test-run-overwrite") + file.create(file.path(expected_path, "configs.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "configspace.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "history.jsonl"), showWarnings = FALSE) + file.create(file.path(expected_path, "meta.json"), showWarnings = FALSE) + file.create(file.path(expected_path, "origins.json"), showWarnings = FALSE) + + rush_plan(n_workers = 2) + skip_if_not_installed("e1071") + + task = tsk("penguins") + learner = lrn("classif.auto", + learner_ids = "svm", + small_data_size = 1, + resampling = rsmp("holdout"), + measure = msr("classif.ce"), + terminator = trm("evals", n_evals = 6) + ) + learner$train(task) + + save_deepcave_run(learner$instance, path = dir, prefix = "test-run-overwrite", overwrite = TRUE) + + expect_file_exists(file.path(expected_path, "configspace.json")) + expect_file_exists(file.path(expected_path, "configs.json")) + expect_file_exists(file.path(expected_path, "history.jsonl")) + expect_file_exists(file.path(expected_path, "meta.json")) + expect_file_exists(file.path(expected_path, "origins.json")) +})