@@ -591,11 +591,75 @@ void print_scheduler_config_info(const SchedulerConfig &scheduler_config) {
591591 std::cout << scheduler_config.to_string () << std::endl;
592592}
593593
594+ void import_npu_model (ov::CompiledModel& compiled,
595+ KVDesc& kv_desc,
596+ const ov::AnyMap& config,
597+ const std::string& blob_path) {
598+ if (!std::filesystem::exists (blob_path)) {
599+ OPENVINO_THROW (" Blob file is not found at: " + blob_path);
600+ }
601+ std::ifstream fin (blob_path, std::ios::in | std::ios::binary);
602+ if (!fin.is_open ()) {
603+ OPENVINO_THROW (" Blob file can't be opened: " + blob_path);
604+ }
605+ compiled = ov::genai::utils::singleton_core ().import_model (fin, " NPU" , config);
606+ kv_desc.max_prompt_len = compiled.get_property (" NPUW_LLM_MAX_PROMPT_LEN" ).as <uint32_t >();
607+ kv_desc.min_response_len = compiled.get_property (" NPUW_LLM_MIN_RESPONSE_LEN" ).as <uint32_t >();
608+ }
609+
610+ void export_npu_model (ov::CompiledModel& compiled,
611+ std::string& blob_path) {
612+ if (blob_path.empty ()) {
613+ blob_path = " openvino_model.blob" ;
614+ }
615+ // Check the path is full
616+ const int EXT_SIZE = 5 ; // ".blob"
617+ if (blob_path.size () < EXT_SIZE) {
618+ OPENVINO_THROW (" Please provide a full path to blob file in BLOB_PATH: " + blob_path);
619+ }
620+ if (strncmp (" .blob" , &blob_path[blob_path.size () - EXT_SIZE], EXT_SIZE) != 0 ) {
621+ OPENVINO_THROW (" Please provide a full path to blob file in BLOB_PATH: " + blob_path);
622+ }
623+ std::ofstream fout (blob_path, std::ios::out | std::ios::binary);
624+ if (!fout.is_open ()) {
625+ OPENVINO_THROW (" Blob file can't be exported to: " + blob_path);
626+ }
627+ compiled.export_model (fout);
628+ }
629+
630+ void get_npu_model_config (ov::AnyMap& properties, const KVAxesPosition& kv_pos,
631+ KVDesc& kv_desc, const bool is_whisper) {
632+ if (is_whisper) {
633+ kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (4u );
634+ // kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
635+ kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (444u );
636+ update_npu_config_whisper (properties, kv_pos, kv_desc);
637+ } else {
638+ kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (1024u );
639+ kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (128u );
640+ update_npu_config (properties, kv_pos, kv_desc);
641+ }
642+ }
643+
644+ void get_npu_text_embedding_config (ov::AnyMap& properties, const KVAxesPosition& kv_pos,
645+ KVDesc& kv_desc, const ov::AnyMap& text_embed_config) {
646+ auto max_len = get_option<uint32_t >(text_embed_config, " MAX_PROMPT_LEN" );
647+ if (max_len.has_value ()) {
648+ kv_desc.max_prompt_len = max_len.value ();
649+ } else {
650+ kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (1024u );
651+ }
652+ kv_desc.min_response_len = kv_desc.max_prompt_len ;
653+ update_npu_config_text_embedding (properties, text_embed_config, kv_pos, kv_desc);
654+ }
655+
594656std::pair<ov::CompiledModel, KVDesc>
595- compile_decoder_for_npu (const std::shared_ptr<ov::Model>& model,
657+ compile_decoder_for_npu_impl (const std::shared_ptr<ov::Model>& model,
596658 const ov::AnyMap& config,
597659 const KVAxesPosition& kv_pos,
598- const bool is_whisper) {
660+ const bool is_whisper,
661+ const bool is_text_embedding,
662+ const ov::AnyMap& text_embed_config = {}) {
599663 ov::CompiledModel compiled;
600664 ov::AnyMap properties = config;
601665 KVDesc kv_desc;
@@ -605,107 +669,38 @@ compile_decoder_for_npu(const std::shared_ptr<ov::Model>& model,
605669 const bool do_import = (!blob_path.empty () && !export_blob);
606670
607671 if (do_import) {
608- if (!std::filesystem::exists (blob_path)) {
609- OPENVINO_THROW (" Blob file is not found at: " + blob_path);
610- }
611- std::ifstream fin (blob_path, std::ios::in | std::ios::binary);
612- if (!fin.is_open ()) {
613- OPENVINO_THROW (" Blob file can't be opened: " + blob_path);
614- }
615- compiled = ov::genai::utils::singleton_core ().import_model (fin, " NPU" , config);
616- kv_desc.max_prompt_len = compiled.get_property (" NPUW_LLM_MAX_PROMPT_LEN" ).as <uint32_t >();
617- kv_desc.min_response_len = compiled.get_property (" NPUW_LLM_MIN_RESPONSE_LEN" ).as <uint32_t >();
672+ import_npu_model (compiled, kv_desc, properties, blob_path);
618673 } else {
619- if (is_whisper) {
620- kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (4u );
621- // kvcache size for Whisper = 448u (MAX_PROMPT_LEN + MIN_RESPONSE_LEN)
622- kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (444u );
623- update_npu_config_whisper (properties, kv_pos, kv_desc);
674+ if (is_text_embedding) {
675+ get_npu_text_embedding_config (properties, kv_pos, kv_desc, text_embed_config);
624676 } else {
625- kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (1024u );
626- kv_desc.min_response_len = pop_int_and_cast (properties, " MIN_RESPONSE_LEN" ).value_or (128u );
627- update_npu_config (properties, kv_pos, kv_desc);
677+ get_npu_model_config (properties, kv_pos, kv_desc, is_whisper);
628678 }
679+
629680 compiled = ov::genai::utils::singleton_core ().compile_model (model, " NPU" , properties);
630681 // Also export compiled model if required
631682 if (export_blob) {
632- if (blob_path.empty ()) {
633- blob_path = " openvino_model.blob" ;
634- }
635- // Check the path is full
636- const int EXT_SIZE = 5 ; // ".blob"
637- if (blob_path.size () < EXT_SIZE) {
638- OPENVINO_THROW (" Please provide a full path to blob file in BLOB_PATH: " + blob_path);
639- }
640- if (strncmp (" .blob" , &blob_path[blob_path.size () - EXT_SIZE], EXT_SIZE) != 0 ) {
641- OPENVINO_THROW (" Please provide a full path to blob file in BLOB_PATH: " + blob_path);
642- }
643- std::ofstream fout (blob_path, std::ios::out | std::ios::binary);
644- if (!fout.is_open ()) {
645- OPENVINO_THROW (" Blob file can't be exported to: " + blob_path);
646- }
647- compiled.export_model (fout);
683+ export_npu_model (compiled, blob_path);
648684 }
649685 }
686+
650687 return { compiled, kv_desc };
651688}
652689
690+ std::pair<ov::CompiledModel, KVDesc>
691+ compile_decoder_for_npu (const std::shared_ptr<ov::Model>& model,
692+ const ov::AnyMap& config,
693+ const KVAxesPosition& kv_pos,
694+ const bool is_whisper) {
695+ return compile_decoder_for_npu_impl (model, config, kv_pos, is_whisper, false );
696+ }
697+
653698std::pair<ov::CompiledModel, KVDesc>
654699compile_decoder_for_npu_text_embedding (const std::shared_ptr<ov::Model>& model,
655700 const ov::AnyMap& config,
656701 const KVAxesPosition& kv_pos,
657702 const ov::AnyMap& text_embed_config) {
658- ov::CompiledModel compiled;
659- ov::AnyMap properties = config;
660- KVDesc kv_desc;
661-
662- auto blob_path = pop_or_default (properties, " BLOB_PATH" , std::string{});
663- const auto export_blob = pop_or_default (properties, " EXPORT_BLOB" , false );
664- const bool do_import = (!blob_path.empty () && !export_blob);
665-
666- if (do_import) {
667- if (!std::filesystem::exists (blob_path)) {
668- OPENVINO_THROW (" Blob file is not found at: " + blob_path);
669- }
670- std::ifstream fin (blob_path, std::ios::in | std::ios::binary);
671- if (!fin.is_open ()) {
672- OPENVINO_THROW (" Blob file can't be opened: " + blob_path);
673- }
674- compiled = ov::genai::utils::singleton_core ().import_model (fin, " NPU" , config);
675- kv_desc.max_prompt_len = compiled.get_property (" NPUW_LLM_MAX_PROMPT_LEN" ).as <uint32_t >();
676- kv_desc.min_response_len = compiled.get_property (" NPUW_LLM_MIN_RESPONSE_LEN" ).as <uint32_t >();
677- } else {
678- auto max_len = get_option<uint32_t >(text_embed_config, " MAX_PROMPT_LEN" );
679- if (max_len.has_value ()) {
680- kv_desc.max_prompt_len = max_len.value ();
681- } else {
682- kv_desc.max_prompt_len = pop_int_and_cast (properties, " MAX_PROMPT_LEN" ).value_or (1024u );
683- }
684- kv_desc.min_response_len = kv_desc.max_prompt_len ;
685- update_npu_config_text_embedding (properties, text_embed_config, kv_pos, kv_desc);
686-
687- compiled = ov::genai::utils::singleton_core ().compile_model (model, " NPU" , properties);
688- // Also export compiled model if required
689- if (export_blob) {
690- if (blob_path.empty ()) {
691- blob_path = " openvino_model.blob" ;
692- }
693- // Check the path is full
694- const int EXT_SIZE = 5 ; // ".blob"
695- if (blob_path.size () < EXT_SIZE) {
696- OPENVINO_THROW (" Please provide a full path to blob file in BLOB_PATH: " + blob_path);
697- }
698- if (strncmp (" .blob" , &blob_path[blob_path.size () - EXT_SIZE], EXT_SIZE) != 0 ) {
699- OPENVINO_THROW (" Please provide a full path to blob file in BLOB_PATH: " + blob_path);
700- }
701- std::ofstream fout (blob_path, std::ios::out | std::ios::binary);
702- if (!fout.is_open ()) {
703- OPENVINO_THROW (" Blob file can't be exported to: " + blob_path);
704- }
705- compiled.export_model (fout);
706- }
707- }
708- return { compiled, kv_desc };
703+ return compile_decoder_for_npu_impl (model, config, kv_pos, false , true , text_embed_config);
709704}
710705
711706std::optional<ov::Any> pop_option (ov::AnyMap& config, const std::string& option_name) {
0 commit comments