diff --git a/common.hpp b/common.hpp
index d9c823df0..8b67815cd 100644
--- a/common.hpp
+++ b/common.hpp
@@ -3,6 +3,10 @@

 #include "ggml_extend.hpp"

+#ifdef SD_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 class DownSampleBlock : public GGMLBlock {
 protected:
     int channels;
@@ -248,9 +252,6 @@ class FeedForward : public GGMLBlock {
         float scale = 1.f;
         if (precision_fix) {
             scale = 1.f / 128.f;
-#ifdef SD_USE_VULKAN
-            force_prec_f32 = true;
-#endif
         }
         // The purpose of the scale here is to prevent NaN issues in certain situations.
         // For example, when using Vulkan without enabling force_prec_f32,
@@ -264,6 +265,11 @@ class FeedForward : public GGMLBlock {
         auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

+#ifdef SD_USE_VULKAN
+        if (ggml_backend_is_vk(ctx->backend)) {
+            net_2->set_force_prec_f32(true);
+        }
+#endif
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
         x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
diff --git a/conditioner.hpp b/conditioner.hpp
index a4e84aa3b..238c28c05 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -2,8 +2,11 @@
 #define __CONDITIONER_HPP__

 #include "clip.hpp"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "llm.hpp"
 #include "t5.hpp"
+#include "util.h"

 struct SDCondition {
     struct ggml_tensor* c_crossattn = nullptr;  // aka context
@@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<uint8_t> token_embed_custom;
     std::map<std::string, std::vector<int>> embedding_pos_map;

-    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+    FrozenCLIPEmbedderWithCustomWords(std::vector<ggml_backend_t> backends,
                                       bool offload_params_to_cpu,
                                       const String2TensorStorage& tensor_storage_map,
                                       const std::map<std::string, std::string>& orig_embedding_map,
@@ -76,13 +79,27 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             tokenizer.add_special_token(name);
         }
         bool force_clip_f32 = !embedding_map.empty();
+
+        ggml_backend_t clip_backend = backends[0];
+
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
+            text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
+            LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend));
+            text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
+            ggml_backend_t clip_g_backend = clip_backend;
+            if (backends.size() >= 2) {
+                clip_g_backend = backends[1];
+                if (backends.size() > 2) {
+                    LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
+                }
+            }
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
+            LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
+            text_model  = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
@@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<CLIPTextModelRunner> clip_g;
     std::shared_ptr<T5Runner> t5;

-    SD3CLIPEmbedder(ggml_backend_t backend,
+    SD3CLIPEmbedder(std::vector<ggml_backend_t> backends,
                     bool offload_params_to_cpu,
                     const String2TensorStorage& tensor_storage_map = {})
         : clip_g_tokenizer(0) {
         bool use_clip_l = false;
         bool use_clip_g = false;
         bool use_t5     = false;
+
+        ggml_backend_t clip_l_backend, clip_g_backend, t5_backend;
+        if (backends.size() == 1) {
+            clip_l_backend = clip_g_backend = t5_backend = backends[0];
+        } else if (backends.size() == 2) {
+            clip_l_backend = clip_g_backend = backends[0];
+            t5_backend     = backends[1];
+        } else if (backends.size() >= 3) {
+            clip_l_backend = backends[0];
+            clip_g_backend = backends[1];
+            t5_backend     = backends[2];
+            if (backends.size() > 3) {
+                LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest.");
+            }
+        }
+
         for (auto pair : tensor_storage_map) {
             if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                 use_clip_l = true;
@@ -723,13 +756,16 @@ struct SD3CLIPEmbedder : public Conditioner {
             return;
         }
         if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
+            clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         }
         if (use_clip_g) {
-            clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
+            clip_g = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
         if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+            LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
+            t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
         }
     }
@@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
     size_t chunk_len = 256;

-    FluxCLIPEmbedder(ggml_backend_t backend,
+    FluxCLIPEmbedder(std::vector<ggml_backend_t> backends,
                      bool offload_params_to_cpu,
                      const String2TensorStorage& tensor_storage_map = {}) {
         bool use_clip_l = false;
         bool use_t5     = false;
+
+        ggml_backend_t clip_l_backend, t5_backend;
+        if (backends.size() == 1) {
+            clip_l_backend = t5_backend = backends[0];
+        } else if (backends.size() >= 2) {
+            clip_l_backend = backends[0];
+            t5_backend     = backends[1];
+            if (backends.size() > 2) {
+                LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
+            }
+        }
+
         for (auto pair : tensor_storage_map) {
             if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                 use_clip_l = true;
@@ -1142,12 +1192,14 @@ struct FluxCLIPEmbedder : public Conditioner {
         }

         if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
+            clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         } else {
             LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
         }
         if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+            LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
+            t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
         } else {
             LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
         }
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index ab58ab5f0..eb79a51cc 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -46,6 +46,7 @@ struct SDCliParams {

     bool color       = false;
     bool normal_exit = false;
+    bool skip_usage  = false;

     ArgOptions get_options() {
         ArgOptions options;
@@ -143,7 +144,27 @@ struct SDCliParams {

         auto on_help_arg = [&](int argc, const char** argv, int index) {
             normal_exit = true;
-            return -1;
+            return VALID_BREAK_OPT;
+        };
+
+        auto on_rpc_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            const char* rpc_device = argv[index];
+            add_rpc_device(rpc_device);
+            return 1;
+        };
+
+        auto on_list_devices_arg = [&](int argc, const char** argv, int index) {
+            size_t buff_size = backend_list_size();
+            char* buff       = (char*)malloc(buff_size);
+            if (buff == nullptr) {
+                return -1;
+            }
+            list_backends_to_buffer(buff, buff_size);
+            printf("List of available GGML devices:\nName\tDescription\n-------------------\n%s\n", buff);
+            free(buff);
+            normal_exit = true;
+            skip_usage  = true;
+            return VALID_BREAK_OPT;
         };

         options.manual_options = {
@@ -159,6 +180,14 @@ struct SDCliParams {
              "--help",
              "show this help message and exit",
              on_help_arg},
+            {"",
+             "--rpc",
+             "add one or more RPC devices (comma-separated list of server addresses)",
+             on_rpc_arg},
+            {"",
+             "--list-devices",
+             "list available ggml compute devices",
+             on_list_devices_arg},
         };

         return options;
@@ -213,7 +242,9 @@
     std::vector<ArgOptions> options_vec = {cli_params.get_options(), ctx_params.get_options(), gen_params.get_options()};

     if (!parse_options(argc, argv, options_vec)) {
-        print_usage(argc, argv, options_vec);
+        if (!cli_params.skip_usage) {
+            print_usage(argc, argv, options_vec);
+        }
         exit(cli_params.normal_exit ? 0 : 1);
     }
@@ -783,7 +814,8 @@ int main(int argc, const char* argv[]) {
                                                      ctx_params.offload_params_to_cpu,
                                                      ctx_params.diffusion_conv_direct,
                                                      ctx_params.n_threads,
-                                                     gen_params.upscale_tile_size);
+                                                     gen_params.upscale_tile_size,
+                                                     ctx_params.upscaler_backend_device.c_str());

         if (upscaler_ctx == nullptr) {
             LOG_ERROR("new_upscaler_ctx failed");
"true" : "false") +#define VALID_BREAK_OPT -42 + const char* modes_str[] = { "img_gen", "vid_gen", @@ -401,16 +403,26 @@ static bool parse_options(int argc, const char** argv, const std::vector embedding_map; std::vector embedding_vec; @@ -454,9 +476,6 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; @@ -561,6 +580,43 @@ struct SDContextParams { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise cpu)", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "device to use for diffusion (defaults to main-backend-device)", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "device to use for clip (defaults to main-backend-device)", + &clip_backend_device}, + {"", + "--vae-backend-device", + "device to use for vae (defaults to main-backend-device). Also applies to tae, unless tae-backend-device is specified", + &vae_backend_device}, + {"", + "--tae-backend-device", + "device to use for tae (defaults to vae-backend-device)", + &tae_backend_device}, + {"", + "--control-net-backend-device", + "device to use for control net (defaults to main-backend-device)", + &control_net_backend_device}, + {"", + "--upscaler-backend-device", + "device to use for upscaling models (defaults to main-backend-device)", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "device to use for photomaker (defaults to main-backend-device)", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "device to use for clip-vision model (defaults to main-backend-device)", + &vision_backend_device}, + }; options.int_options = { @@ -603,18 +659,6 @@ struct SDContextParams { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--diffusion-fa", "use flash attention in the diffusion model", @@ -875,6 +919,7 @@ struct SDContextParams { std::string embeddings_str = emb_ss.str(); std::ostringstream oss; + // TODO backend devices oss << "SDContextParams {\n" << " n_threads: " << n_threads << ",\n" << " model_path: \"" << model_path << "\",\n" @@ -901,9 +946,9 @@ struct SDContextParams { << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + // << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" + // << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" + // << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? 
"true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" @@ -965,9 +1010,6 @@ struct SDContextParams { lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, diffusion_flash_attn, taesd_preview, diffusion_conv_direct, @@ -980,6 +1022,14 @@ struct SDContextParams { chroma_t5_mask_pad, qwen_image_zero_cond_t, flow_shift, + main_backend_device.c_str(), + diffusion_backend_device.c_str(), + clip_backend_device.c_str(), + vae_backend_device.c_str(), + tae_backend_device.c_str(), + control_net_backend_device.c_str(), + photomaker_backend_device.c_str(), + vision_backend_device.c_str(), }; return sd_ctx_params; } diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 7dac03738..2a587549e 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -28,26 +28,6 @@ #include "model.h" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - -#ifdef SD_USE_SYCL -#include "ggml-sycl.h" -#endif - #include "rng.hpp" #include "util.h" @@ -88,6 +68,42 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } +__STATIC_INLINE__ bool backend_name_exists(std::string name) { + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { + return true; + } + } + return false; +} + +__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { + if (name == "" || backend_name_exists(name)) { + return name; + } else { + LOG_WARN("Backend %s not found, using default backend", name.c_str()); + return ""; + } +} + +__STATIC_INLINE__ std::string get_default_backend_name() { + // should pick the same backend as ggml_backend_init_best + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + dev = dev ? 
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 7dac03738..2a587549e 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -28,26 +28,6 @@

 #include "model.h"

-#ifdef SD_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef SD_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef SD_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef SD_USE_OPENCL
-#include "ggml-opencl.h"
-#endif
-
-#ifdef SD_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
 #include "rng.hpp"
 #include "util.h"

@@ -88,6 +68,42 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
     }
 }

+__STATIC_INLINE__ bool backend_name_exists(std::string name) {
+    const int device_count = ggml_backend_dev_count();
+    for (int i = 0; i < device_count; i++) {
+        if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) {
+            return true;
+        }
+    }
+    return false;
+}
+
+__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) {
+    if (name == "" || backend_name_exists(name)) {
+        return name;
+    } else {
+        LOG_WARN("Backend %s not found, using default backend", name.c_str());
+        return "";
+    }
+}
+
+__STATIC_INLINE__ std::string get_default_backend_name() {
+    // should pick the same backend as ggml_backend_init_best
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+    dev                    = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+    dev                    = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    return ggml_backend_dev_name(dev);
+}
+
+__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") {
+    LOG_DEBUG("Initializing backend: %s", name.empty() ? "(default)" : name.c_str());
+    if (name.empty()) {
+        return ggml_backend_init_best();
+    } else {
+        return ggml_backend_init_by_name(name.c_str(), nullptr);
+    }
+}
+
 static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");

 // n-mode tensor-matrix product
@@ -2220,6 +2236,14 @@ class Linear : public UnaryBlock {
           force_prec_f32(force_prec_f32),
           scale(scale) {}

+    void set_scale(float scale_) {
+        scale = scale_;
+    }
+
+    void set_force_prec_f32(bool force_prec_f32_) {
+        force_prec_f32 = force_prec_f32_;
+    }
+
     struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
         struct ggml_tensor* b = nullptr;
diff --git a/model.cpp b/model.cpp
index 253dd25cd..786b8c739 100644
--- a/model.cpp
+++ b/model.cpp
@@ -29,18 +29,6 @@
 #include "name_conversion.h"
 #include "stable-diffusion.h"

-#ifdef SD_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef SD_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef SD_USE_OPENCL
-#include "ggml-opencl.h"
-#endif
-
 #define ST_HEADER_SIZE_LEN 8

 uint64_t read_u64(uint8_t* buffer) {
diff --git a/qwen_image.hpp b/qwen_image.hpp
index dfa539788..87952ef2d 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -7,6 +7,10 @@
 #include "flux.hpp"
 #include "ggml_extend.hpp"

+#ifdef SD_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 namespace Qwen {

     constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
@@ -96,9 +100,7 @@ namespace Qwen {
             float scale         = 1.f / 32.f;
             bool force_prec_f32 = false;
-#ifdef SD_USE_VULKAN
-            force_prec_f32 = true;
-#endif
+
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example when using CUDA but the weights are k-quants (not all prompts).
             blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
@@ -124,6 +126,11 @@ namespace Qwen {
             auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
             auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
             auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+#ifdef SD_USE_VULKAN
+            if (ggml_backend_is_vk(ctx->backend)) {
+                to_out_0->set_force_prec_f32(true);
+            }
+#endif
             auto norm_added_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_q"]);
             auto norm_added_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_k"]);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index b181f994b..60e4c6338 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1,3 +1,4 @@
+#include "ggml-cpu.h"
 #include "ggml_extend.hpp"

 #include "model.h"
@@ -5,6 +6,7 @@
 #include "rng_mt19937.hpp"
 #include "rng_philox.hpp"
 #include "stable-diffusion.h"
+#include <sstream>
 #include "util.h"

 #include "cache_dit.hpp"
@@ -94,14 +96,129 @@ void suppress_pp(int step, int steps, float time, void* data) {
     return;
 }

+std::vector<std::string> string_split(const std::string& input, char separator) {
+    std::vector<std::string> parts;
+    size_t begin_pos     = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos     = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
+static void add_rpc_devices(const std::string& servers) {
+    auto rpc_servers = string_split(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char* endpoint);
+    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
+    if (!ggml_backend_rpc_add_server_fn) {
+        throw std::invalid_argument("failed to find RPC add server function");
+    }
+    for (const auto& server : rpc_servers) {
+        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
+        ggml_backend_register(reg);
+    }
+}
+
+void add_rpc_device(const char* servers_cstr) {
+    std::string servers(servers_cstr);
+    add_rpc_devices(servers);
+}
+
+std::vector<std::string> sanitize_backend_name_list(std::string name) {
+    std::vector<std::string> vec = {};
+    if (name == "" || backend_name_exists(name)) {
+        // single backend
+        vec.push_back(name);
+    } else if (name.find(",") != std::string::npos) {
+        // comma-separated backend names
+        std::stringstream ss(name);
+        std::string token;
+        while (std::getline(ss, token, ',')) {
+            if (token == "" || backend_name_exists(token)) {
+                vec.push_back(token);
+            } else {
+                LOG_WARN("backend name %s not found, using default", token.c_str());
+                vec.push_back("");
+            }
+        }
+    } else {
+        LOG_WARN("backend name %s not found, using default", name.c_str());
+        vec.push_back("");
+    }
+    return vec;
+}
+
+std::vector<std::pair<std::string, std::string>> list_backends_vector() {
+    std::vector<std::pair<std::string, std::string>> backends;
+    const int device_count = ggml_backend_dev_count();
+    for (int i = 0; i < device_count; i++) {
+        auto dev = ggml_backend_dev_get(i);
+        backends.push_back({ggml_backend_dev_name(dev), ggml_backend_dev_description(dev)});
+    }
+    return backends;
+}
+
+// for C API
+SD_API size_t backend_list_size() {
+    size_t buffer_size = 0;
+    auto backends      = list_backends_vector();
+    for (auto& backend : backends) {
+        auto dev_name_size = backend.first.size();
+        auto dev_desc_size = backend.second.size();
+        buffer_size += dev_name_size + dev_desc_size + 2;  // +2 for the '\t' and '\n' separators
+    }
+    return buffer_size + 1;  // +1 for the trailing '\0'
+}
+
+// devices are separated by \n and name and description are separated by \t
+SD_API void list_backends_to_buffer(char* buffer, size_t buffer_size) {
+    auto backends = list_backends_vector();
+    size_t offset = 0;
+    for (auto& backend : backends) {
+        size_t name_size = backend.first.size();
+        size_t desc_size = backend.second.size();
+        if (offset + name_size + desc_size + 2 > buffer_size) {
+            LOG_WARN("Provided buffer size is too small to contain details of all devices.");
+            break;  // not enough space in the buffer
+        }
+        memcpy(buffer + offset, backend.first.c_str(), name_size);
+        offset += name_size;
+        buffer[offset++] = '\t';
+        memcpy(buffer + offset, backend.second.c_str(), desc_size);
+        offset += desc_size;
+        buffer[offset++] = '\n';
+    }
+    if (offset < buffer_size) {
+        buffer[offset] = '\0';  // ensure the buffer is null-terminated
+    } else if (buffer_size > 0) {
+        buffer[buffer_size - 1] = '\0';
+    }
+}
+
 /*================================================= StableDiffusionGGML ================================================*/

 class StableDiffusionGGML {
 public:
     ggml_backend_t backend             = nullptr;  // general backend
-    ggml_backend_t clip_backend        = nullptr;
+    ggml_backend_t diffusion_backend   = nullptr;
     ggml_backend_t control_net_backend = nullptr;
     ggml_backend_t vae_backend         = nullptr;
+    ggml_backend_t tae_backend         = nullptr;
+    ggml_backend_t pmid_backend        = nullptr;
+    ggml_backend_t vision_backend      = nullptr;
+
+    std::vector<ggml_backend_t> clip_backends = {nullptr};

     SDVersion version;
     bool vae_decode_only = false;
@@ -147,72 +264,32 @@ class StableDiffusionGGML {
     StableDiffusionGGML() = default;

     ~StableDiffusionGGML() {
-        if (clip_backend != backend) {
-            ggml_backend_free(clip_backend);
+        if (diffusion_backend != backend) {
+            ggml_backend_free(diffusion_backend);
+        }
+        for (auto clip_backend : clip_backends) {
+            if (clip_backend != backend) {
+                ggml_backend_free(clip_backend);
+            }
         }
         if (control_net_backend != backend) {
             ggml_backend_free(control_net_backend);
         }
+        if (tae_backend != vae_backend) {
+            ggml_backend_free(tae_backend);
+        }
         if (vae_backend != backend) {
             ggml_backend_free(vae_backend);
         }
+        if (pmid_backend != backend) {
+            ggml_backend_free(pmid_backend);
+        }
+        if (vision_backend != backend) {
+            ggml_backend_free(vision_backend);
+        }
         ggml_backend_free(backend);
     }

-    void init_backend() {
-#ifdef SD_USE_CUDA
-        LOG_DEBUG("Using CUDA backend");
-        backend = ggml_backend_cuda_init(0);
-#endif
-#ifdef SD_USE_METAL
-        LOG_DEBUG("Using Metal backend");
-        backend = ggml_backend_metal_init();
-#endif
-#ifdef SD_USE_VULKAN
-        LOG_DEBUG("Using Vulkan backend");
-        size_t device          = 0;
-        const int device_count = ggml_backend_vk_get_device_count();
-        if (device_count) {
-            const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
-            if (SD_VK_DEVICE != nullptr) {
-                std::string sd_vk_device_str = SD_VK_DEVICE;
-                try {
-                    device = std::stoull(sd_vk_device_str);
-                } catch (const std::invalid_argument&) {
-                    LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE);
-                    device = 0;
-                } catch (const std::out_of_range&) {
-                    LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE);
-                    device = 0;
-                }
-                if (device >= device_count) {
-                    LOG_WARN("Cannot find targeted vulkan device (%llu). Falling back to device 0.", device);
-                    device = 0;
-                }
-            }
-            LOG_INFO("Vulkan: Using device %llu", device);
-            backend = ggml_backend_vk_init(device);
-        }
-        if (!backend) {
-            LOG_WARN("Failed to initialize Vulkan backend");
-        }
-#endif
-#ifdef SD_USE_OPENCL
-        LOG_DEBUG("Using OpenCL backend");
-        // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
-        backend = ggml_backend_opencl_init();
-        if (!backend) {
-            LOG_WARN("Failed to initialize OpenCL backend");
-        }
-#endif
-#ifdef SD_USE_SYCL
-        LOG_DEBUG("Using SYCL backend");
-        backend = ggml_backend_sycl_init(0);
-#endif
-        if (!backend) {
-            LOG_DEBUG("Using CPU backend");
-            backend = ggml_backend_cpu_init();
+    void log_backends() {
+        const int device_count = ggml_backend_dev_count();
+        for (int i = 0; i < device_count; i++) {
+            auto dev = ggml_backend_dev_get(i);
+            LOG_INFO("%s (%s)", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
         }
     }

@@ -243,7 +320,54 @@ class StableDiffusionGGML {

         ggml_log_set(ggml_log_callback_default, nullptr);

-        init_backend();
+        log_backends();
+
+        std::string default_backend_name = get_default_backend_name();
+
+        std::string override_default_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->main_device));
+
+        if (override_default_backend_name.size() > 0) {
+            LOG_INFO("Setting default backend to %s", override_default_backend_name.c_str());
+            default_backend_name = override_default_backend_name;
+        }
+
+        std::string diffusion_backend_name          = sanitize_backend_name(SAFE_STR(sd_ctx_params->diffusion_device));
+        std::vector<std::string> clip_backend_names = sanitize_backend_name_list(SAFE_STR(sd_ctx_params->clip_device));
+        std::string control_net_backend_name        = sanitize_backend_name(SAFE_STR(sd_ctx_params->control_net_device));
+        std::string vae_backend_name                = sanitize_backend_name(SAFE_STR(sd_ctx_params->vae_device));
+        std::string tae_backend_name                = sanitize_backend_name(SAFE_STR(sd_ctx_params->tae_device));
+        std::string pmid_backend_name               = sanitize_backend_name(SAFE_STR(sd_ctx_params->photomaker_device));
+        std::string vision_backend_name             = sanitize_backend_name(SAFE_STR(sd_ctx_params->vision_device));
+
+        bool diffusion_backend_is_default = diffusion_backend_name.empty() || diffusion_backend_name == default_backend_name;
+        bool clip_backends_are_default    = true;
+        for (const auto& clip_backend_name : clip_backend_names) {
+            if (!clip_backend_name.empty() && clip_backend_name != default_backend_name) {
+                clip_backends_are_default = false;
+                break;
+            }
+        }
+        bool control_net_backend_is_default = (control_net_backend_name.empty() || control_net_backend_name == default_backend_name);
+        bool vae_backend_is_default         = (vae_backend_name.empty() || vae_backend_name == default_backend_name);
+        // if tae_backend_name is empty, the tae uses the same backend as the vae
+        bool tae_backend_is_default         = (tae_backend_name.empty() && vae_backend_is_default) || tae_backend_name == default_backend_name;
+        bool pmid_backend_is_default        = (pmid_backend_name.empty() || pmid_backend_name == default_backend_name);
+        bool vision_backend_is_default      = (vision_backend_name.empty() || vision_backend_name == default_backend_name);
+
+        // the default backend only needs to be initialized if at least one component runs on it
+        bool use_default_backend = diffusion_backend_is_default || clip_backends_are_default || control_net_backend_is_default || vae_backend_is_default || tae_backend_is_default || pmid_backend_is_default || vision_backend_is_default;
+
+        if (use_default_backend) {
+            backend = init_named_backend(override_default_backend_name);
+            LOG_DEBUG("Loaded default backend %s", ggml_backend_name(backend));
+        }
+
+        if (!diffusion_backend_is_default) {
+            diffusion_backend = init_named_backend(diffusion_backend_name);
+            LOG_INFO("Using diffusion backend: %s", ggml_backend_name(diffusion_backend));
+        } else {
+            diffusion_backend = backend;
+        }

         ModelLoader model_loader;
@@ -419,21 +543,24 @@ class StableDiffusionGGML {
             LOG_INFO("Using circular padding for convolutions");
         }

-        bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
-        {
-            clip_backend = backend;
-            if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
-                LOG_INFO("CLIP: Using CPU backend");
-                clip_backend = ggml_backend_cpu_init();
+        if (!clip_backends_are_default) {
+            clip_backends.clear();
+            for (auto clip_backend_name : clip_backend_names) {
+                auto clip_backend = init_named_backend(clip_backend_name);
+                LOG_INFO("CLIP: Using %s backend", ggml_backend_name(clip_backend));
+                clip_backends.push_back(clip_backend);
             }
+        } else {
+            clip_backends = {backend};
+        }
+
+        // also resolve the clip-vision backend here, so the branches below can use it
+        vision_backend = backend;
+        if (!vision_backend_is_default) {
+            vision_backend = init_named_backend(vision_backend_name);
+            LOG_INFO("CLIP Vision: Using %s backend", ggml_backend_name(vision_backend));
+        }

         if (sd_version_is_sd3(version)) {
-            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
+            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backends,
                                                                  offload_params_to_cpu,
                                                                  tensor_storage_map);
-            diffusion_model  = std::make_shared<MMDiTModel>(backend,
+            diffusion_model  = std::make_shared<MMDiTModel>(diffusion_backend,
                                                             offload_params_to_cpu,
                                                             tensor_storage_map);
         } else if (sd_version_is_flux(version)) {
             bool is_chroma = false;
             for (auto pair : tensor_storage_map) {
@@ -452,53 +579,53 @@ class StableDiffusionGGML {
                          "--chroma-disable-dit-mask as a workaround.");
                 }

-                cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
+                cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backends[0],
                                                                         offload_params_to_cpu,
                                                                         tensor_storage_map,
                                                                         sd_ctx_params->chroma_use_t5_mask,
                                                                         sd_ctx_params->chroma_t5_mask_pad);
             } else if (version == VERSION_OVIS_IMAGE) {
-                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backends[0],
                                                                  offload_params_to_cpu,
                                                                  tensor_storage_map,
                                                                  version,
                                                                  "",
                                                                  false);
             } else {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backends,
                                                                       offload_params_to_cpu,
                                                                       tensor_storage_map);
             }
-            diffusion_model = std::make_shared<FluxModel>(backend,
+            diffusion_model = std::make_shared<FluxModel>(diffusion_backend,
                                                           offload_params_to_cpu,
                                                           tensor_storage_map,
                                                           version,
                                                           sd_ctx_params->chroma_use_dit_mask);
         } else if (sd_version_is_flux2(version)) {
             bool is_chroma   = false;
-            cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+            cond_stage_model = std::make_shared<LLMEmbedder>(clip_backends[0],
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              version);
-            diffusion_model  = std::make_shared<FluxModel>(backend,
+            diffusion_model  = std::make_shared<FluxModel>(diffusion_backend,
                                                            offload_params_to_cpu,
                                                            tensor_storage_map,
                                                            version,
                                                            sd_ctx_params->chroma_use_dit_mask);
         } else if (sd_version_is_wan(version)) {
-            cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
+            cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backends[0],
                                                                 offload_params_to_cpu,
                                                                 tensor_storage_map,
                                                                 true,
                                                                 1,
                                                                 true);
-            diffusion_model  = std::make_shared<WanModel>(backend,
+            diffusion_model  = std::make_shared<WanModel>(diffusion_backend,
                                                           offload_params_to_cpu,
                                                           tensor_storage_map,
                                                           "model.diffusion_model",
                                                           version);
             if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
-                high_noise_diffusion_model = std::make_shared<WanModel>(backend,
+                high_noise_diffusion_model = std::make_shared<WanModel>(diffusion_backend,
                                                                         offload_params_to_cpu,
                                                                         tensor_storage_map,
                                                                         "model.high_noise_diffusion_model",
@@ -507,7 +634,7 @@ class StableDiffusionGGML {
             if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
                 diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" ||
                 diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") {
-                clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
+                clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(vision_backend,
                                                                          offload_params_to_cpu,
                                                                          tensor_storage_map);
                 clip_vision->alloc_params_buffer();
@@ -518,48 +645,48 @@ class StableDiffusionGGML {
             if (!vae_decode_only) {
                 enable_vision = true;
             }
-            cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+            cond_stage_model = std::make_shared<LLMEmbedder>(clip_backends[0],
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              version,
                                                              "",
                                                              enable_vision);
-            diffusion_model  = std::make_shared<QwenImageModel>(backend,
+            diffusion_model  = std::make_shared<QwenImageModel>(diffusion_backend,
                                                                 offload_params_to_cpu,
                                                                 tensor_storage_map,
                                                                 "model.diffusion_model",
                                                                 version,
                                                                 sd_ctx_params->qwen_image_zero_cond_t);
         } else if (sd_version_is_z_image(version)) {
-            cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+            cond_stage_model = std::make_shared<LLMEmbedder>(clip_backends[0],
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              version);
-            diffusion_model  = std::make_shared<ZImageModel>(backend,
+            diffusion_model  = std::make_shared<ZImageModel>(diffusion_backend,
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              "model.diffusion_model",
                                                              version);
         } else {  // SD1.x SD2.x SDXL
             std::map<std::string, std::string> embbeding_map;
             for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
                 embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
             }
             if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
-                cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backends,
                                                                                        offload_params_to_cpu,
                                                                                        tensor_storage_map,
                                                                                        embbeding_map,
                                                                                        version,
                                                                                        PM_VERSION_2);
             } else {
-                cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backends,
                                                                                        offload_params_to_cpu,
                                                                                        tensor_storage_map,
                                                                                        embbeding_map,
                                                                                        version);
             }
-            diffusion_model = std::make_shared<UNetModel>(backend,
+            diffusion_model = std::make_shared<UNetModel>(diffusion_backend,
                                                           offload_params_to_cpu,
                                                           tensor_storage_map,
                                                           version);
@@ -592,11 +719,15 @@ class StableDiffusionGGML {
             high_noise_diffusion_model->get_param_tensors(tensors);
         }

-        if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
-            LOG_INFO("VAE Autoencoder: Using CPU backend");
-            vae_backend = ggml_backend_cpu_init();
-        } else {
-            vae_backend = backend;
+        vae_backend = backend;
+        if (!vae_backend_is_default) {
+            vae_backend = init_named_backend(vae_backend_name);
+            LOG_INFO("VAE Autoencoder: Using %s backend", ggml_backend_name(vae_backend));
+        }
+        tae_backend = vae_backend;
+        if (tae_backend_name.length() > 0 && tae_backend_name != vae_backend_name) {
+            tae_backend = init_named_backend(tae_backend_name);
+            LOG_INFO("Tiny Autoencoder: Using %s backend", ggml_backend_name(tae_backend));
         }

         if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
@@ -639,14 +770,14 @@ class StableDiffusionGGML {
         }
         if (use_tiny_autoencoder || version == VERSION_SDXS) {
             if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(tae_backend,
                                                                     offload_params_to_cpu,
                                                                     tensor_storage_map,
                                                                     "decoder",
                                                                     vae_decode_only,
                                                                     version);
             } else {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(tae_backend,
                                                                     offload_params_to_cpu,
                                                                     tensor_storage_map,
                                                                     "decoder.layers",
@@ -664,14 +795,13 @@ class StableDiffusionGGML {
         }

         if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
-            ggml_backend_t controlnet_backend = nullptr;
-            if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
-                LOG_DEBUG("ControlNet: Using CPU backend");
-                controlnet_backend = ggml_backend_cpu_init();
+            if (!control_net_backend_is_default) {
+                control_net_backend = init_named_backend(control_net_backend_name);
+                LOG_INFO("ControlNet: Using %s backend", ggml_backend_name(control_net_backend));
             } else {
-                controlnet_backend = backend;
+                control_net_backend = backend;
             }
-            control_net = std::make_shared<ControlNet>(controlnet_backend,
+            control_net = std::make_shared<ControlNet>(control_net_backend,
                                                        offload_params_to_cpu,
                                                        tensor_storage_map,
                                                        version);
@@ -680,9 +810,15 @@ class StableDiffusionGGML {
                 control_net->set_conv2d_direct_enabled(true);
             }
         }
-
+        if (!pmid_backend_is_default) {
+            pmid_backend = init_named_backend(pmid_backend_name);
+            LOG_INFO("PhotoMaker: Using %s backend", ggml_backend_name(pmid_backend));
+        } else {
+            pmid_backend = backend;
+        }
         if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(pmid_backend,
                                                                offload_params_to_cpu,
                                                                tensor_storage_map,
                                                                "pmid",
@@ -690,14 +826,14 @@ class StableDiffusionGGML {
                                                                PM_VERSION_2);
             LOG_INFO("using PhotoMaker Version 2");
         } else {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(pmid_backend,
                                                                offload_params_to_cpu,
                                                                tensor_storage_map,
                                                                "pmid",
                                                                version);
         }
         if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
-            pmid_lora = std::make_shared<LoraModel>("pmid", backend, sd_ctx_params->photo_maker_path, "", version);
+            pmid_lora = std::make_shared<LoraModel>("pmid", diffusion_backend, sd_ctx_params->photo_maker_path, "", version);
             auto lora_tensor_filter = [&](const std::string& tensor_name) {
                 if (starts_with(tensor_name, "lora.model")) {
                     return true;
@@ -817,13 +953,15 @@ class StableDiffusionGGML {
         size_t total_params_ram_size  = 0;
         size_t total_params_vram_size = 0;

-        if (ggml_backend_is_cpu(clip_backend)) {
+        // TODO: split the accounting by individual text encoder backends
+        if (ggml_backend_is_cpu(clip_backends[0])) {
             total_params_ram_size += clip_params_mem_size + pmid_params_mem_size;
         } else {
             total_params_vram_size += clip_params_mem_size + pmid_params_mem_size;
         }

-        if (ggml_backend_is_cpu(backend)) {
+        if (ggml_backend_is_cpu(diffusion_backend)) {
             total_params_ram_size += unet_params_mem_size;
         } else {
             total_params_vram_size += unet_params_mem_size;
@@ -849,7 +987,8 @@ class StableDiffusionGGML {
             total_params_vram_size / 1024.0 / 1024.0,
             total_params_ram_size / 1024.0 / 1024.0,
             clip_params_mem_size / 1024.0 / 1024.0,
-            ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
+            // TODO: split by text encoder
+            ggml_backend_is_cpu(clip_backends[0]) ? "RAM" : "VRAM",
             unet_params_mem_size / 1024.0 / 1024.0,
-            ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+            ggml_backend_is_cpu(diffusion_backend) ? "RAM" : "VRAM",
             vae_params_mem_size / 1024.0 / 1024.0,
             ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
             control_net_params_mem_size / 1024.0 / 1024.0,
             ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
             pmid_params_mem_size / 1024.0 / 1024.0,
-            ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
+            ggml_backend_is_cpu(pmid_backend) ? "RAM" : "VRAM");
"RAM" : "VRAM"); } // init denoiser @@ -1052,8 +1191,15 @@ class StableDiffusionGGML { for (auto& kv : lora_state_diff) { int64_t t0 = ggml_time_ms(); - - auto lora = load_lora_model_from_file(kv.first, kv.second, backend); + // TODO: Fix that + bool are_clip_backends_compatible = true; + for (auto backend: clip_backends){ + are_clip_backends_compatible = are_clip_backends_compatible && (diffusion_backend==backend || ggml_backend_is_cpu(backend)); + } + if(!are_clip_backends_compatible){ + LOG_WARN("Diffusion models and text encoders are running on different backends. This may cause issues when immediately applying LoRAs."); + } + auto lora = load_lora_model_from_file(kv.first, kv.second, diffusion_backend); if (!lora || lora->lora_tensors.empty()) { continue; } @@ -1098,8 +1244,8 @@ class StableDiffusionGGML { for (auto& kv : lora_state_diff) { const std::string& lora_id = kv.first; float multiplier = kv.second; - - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + //TODO: split by model + auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backends[0], lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); cond_stage_lora_models.push_back(lora); @@ -1131,7 +1277,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, diffusion_backend, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -2893,9 +3039,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; sd_ctx_params->diffusion_flash_attn = false; sd_ctx_params->circular_x = false; sd_ctx_params->circular_y = false; @@ -2910,7 +3053,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { if (!buf) return nullptr; buf[0] = '\0'; - + // TODO devices snprintf(buf + strlen(buf), 4096 - strlen(buf), "model_path: %s\n" "clip_l_path: %s\n" @@ -2934,9 +3077,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" "circular_y: %s\n" @@ -2965,9 +3105,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), BOOL_STR(sd_ctx_params->circular_y), diff --git a/stable-diffusion.h b/stable-diffusion.h index 8f040d2bd..a10dd7d60 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -183,9 +183,9 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool 
keep_vae_on_cpu; + // bool keep_clip_on_cpu; + // bool keep_control_net_on_cpu; + // bool keep_vae_on_cpu; bool diffusion_flash_attn; bool tae_preview_only; bool diffusion_conv_direct; @@ -198,6 +198,14 @@ typedef struct { int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; float flow_shift; + const char* main_device; + const char* diffusion_device; + const char* clip_device; + const char* vae_device; + const char* tae_device; + const char* control_net_device; + const char* photomaker_device; + const char* vision_device; } sd_ctx_params_t; typedef struct { @@ -377,7 +385,8 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size); + int tile_size, + const char * device); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, @@ -403,6 +412,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +SD_API size_t backend_list_size(void); +SD_API void list_backends_to_buffer(char* buffer, size_t buffer_size); + +SD_API void add_rpc_device(const char* address); + #ifdef __cplusplus } #endif diff --git a/upscaler.cpp b/upscaler.cpp index 29ac981e6..ea198f166 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -22,37 +22,20 @@ struct UpscalerGGML { bool load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, - int n_threads) { + int n_threads, + std::string device = "") { ggml_log_set(ggml_log_callback_default, nullptr); -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - backend = ggml_backend_vk_init(0); -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - backend = ggml_backend_opencl_init(); -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif + device = sanitize_backend_name(device); + backend = init_named_backend(device); ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); } model_loader.set_wtype_override(model_data_type); - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); - } + // if (!backend) { + // LOG_DEBUG("Using CPU backend"); + // backend = ggml_backend_cpu_init(); + // } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); if (direct) { @@ -117,7 +100,8 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size) { + int tile_size, + const char* device) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == nullptr) { return nullptr; @@ -129,7 +113,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, return nullptr; } - if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) { + if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads, SAFE_STR(device))) { delete upscaler_ctx->upscaler; upscaler_ctx->upscaler = nullptr; free(upscaler_ctx); diff --git a/z_image.hpp b/z_image.hpp 
index cee23833a..ef1e48dff 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -7,6 +7,14 @@ #include "ggml_extend.hpp" #include "mmdit.hpp" +#ifdef SD_USE_VULKAN +#include "ggml-vulkan.h" +#endif + +#if GGML_USE_HIP +#include "ggml-cuda.h" +#endif + // Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py // Ref: https://github.com/huggingface/diffusers/pull/12703 @@ -31,10 +39,6 @@ namespace ZImage { : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) { blocks["qkv"] = std::make_shared(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false); float scale = 1.f; -#if GGML_USE_HIP - // Prevent NaN issues with certain ROCm setups - scale = 1.f / 16.f; -#endif blocks["out"] = std::make_shared(num_heads * head_dim, hidden_size, false, false, false, scale); if (qk_norm) { blocks["q_norm"] = std::make_shared(head_dim); @@ -51,6 +55,12 @@ namespace ZImage { int64_t N = x->ne[2]; auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto out_proj = std::dynamic_pointer_cast(blocks["out"]); +#if GGML_USE_HIP + // Prevent NaN issues with certain ROCm setups + if (ggml_backend_is_cuda(ctx->backend)) { + out_proj->set_scale(1.f / 16.f); + } +#endif auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim] qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim] @@ -115,9 +125,7 @@ namespace ZImage { bool force_prec_f32 = false; float scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using CUDA but the weights are k-quants. blocks["w2"] = std::make_shared(hidden_dim, dim, false, false, force_prec_f32, scale); @@ -128,6 +136,11 @@ namespace ZImage { auto w1 = std::dynamic_pointer_cast(blocks["w1"]); auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto w3 = std::dynamic_pointer_cast(blocks["w3"]); +#ifdef SD_USE_VULKAN + if(ggml_backend_is_vk(ctx->backend)){ + w2->set_force_prec_f32(true); + } +#endif auto x1 = w1->forward(ctx, x); auto x3 = w3->forward(ctx, x);
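// --- Example (not part of the patch): the extended upscaler entry point ---
// Hedged sketch; the model path, device name and sizes are placeholders.
// Passing NULL or "" as the device keeps the old behavior (best available
// backend, via SAFE_STR and init_named_backend above).
#include "stable-diffusion.h"

void example() {
    upscaler_ctx_t* up = new_upscaler_ctx("realesrgan-x4.safetensors",  // esrgan_path (placeholder)
                                          false,                        // offload_params_to_cpu
                                          false,                        // direct conv2d
                                          4,                            // n_threads
                                          128,                          // tile_size
                                          "Vulkan0");                   // device
    if (up != nullptr) {
        // sd_image_t out = upscale(up, input_image, 4);
        free_upscaler_ctx(up);
    }
}
// ---------------------------------------------------------------------------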