diff --git a/site/docs/bindings/node-js.md b/site/docs/bindings/node-js.md index 7abaef3504..ece443fbaf 100644 --- a/site/docs/bindings/node-js.md +++ b/site/docs/bindings/node-js.md @@ -9,7 +9,7 @@ description: Node.js bindings provide JavaScript/TypeScript API. OpenVINO GenAI provides Node.js bindings that enable you to use generative AI pipelines in JavaScript and TypeScript applications. :::warning API Coverage -Node.js bindings currently provide a subset of the full OpenVINO GenAI API available in C++ and Python. The focus is on core text generation (`LLMPipeline`) and text embedding (`TextEmbeddingPipeline`) functionality. +Node.js bindings currently provide a subset of the full OpenVINO GenAI API available in C++ and Python. The focus is on core text generation (`LLMPipeline`), vision language models (`VLMPipeline`), and text embedding (`TextEmbeddingPipeline`) functionality. ::: ## Supported Pipelines and Features @@ -23,6 +23,10 @@ Node.js bindings currently support: - Multiple sampling strategies (greedy, beam search) - Structured output - ReAct agent support +- `VLMPipeline`: Vision Language Model inference for multimodal tasks + - Process images and videos with text prompts + - Chat mode with conversation history + - Streaming support - `TextEmbeddingPipeline`: Generate text embeddings for semantic search and RAG applications - `Tokenizer`: Fast tokenization / detokenization and chat prompt formatting - Encode strings into token id and attention mask tensors diff --git a/site/docs/use-cases/image-processing/_sections/_run_model/_code_example_js.mdx b/site/docs/use-cases/image-processing/_sections/_run_model/_code_example_js.mdx new file mode 100644 index 0000000000..f2a495c1e7 --- /dev/null +++ b/site/docs/use-cases/image-processing/_sections/_run_model/_code_example_js.mdx @@ -0,0 +1,40 @@ +import CodeBlock from '@theme/CodeBlock'; + + +{`import { addon as ov } from "openvino-node"; +import { VLMPipeline } from "openvino-genai-node"; +import { stat, readdir } from "node:fs/promises"; +import sharp from "sharp"; +import path from "node:path"; + +async function readImage(imagePath) { + const img = sharp(imagePath); + const metadata = await img.metadata(); + const { width, height, channels } = metadata; + const imageBuffer = await img.raw().toBuffer(); + return new ov.Tensor(ov.element.u8, [height, width, channels], imageBuffer); +} + +async function readImages(imagePath) { + const stats = await stat(imagePath); + if (stats.isDirectory()) { + const files = await readdir(imagePath); + return Promise.all(files.sort().map((file) => readImage(path.join(imagePath, file)))); + } + return [await readImage(imagePath)]; +} + +const images = await readImages("./images"); + +const pipe = await VLMPipeline(modelPath, "${props.device || 'CPU'}"); + +const result = await pipe.generate(prompt, { + images, + generationConfig: { max_new_tokens: 100 }, +}); +console.log(result.texts[0]); + +// To input videos frames, use 'videos' option, frames tensor layout = [Frame, H, W, C] +// const result = await pipe.generate(prompt, { videos: [frames], generationConfig: { max_new_tokens: 100 } }); +`} + diff --git a/site/docs/use-cases/image-processing/_sections/_run_model/index.mdx b/site/docs/use-cases/image-processing/_sections/_run_model/index.mdx index b5082eb1ef..284193977c 100644 --- a/site/docs/use-cases/image-processing/_sections/_run_model/index.mdx +++ b/site/docs/use-cases/image-processing/_sections/_run_model/index.mdx @@ -1,5 +1,6 @@ import CodeExampleCPP from './_code_example_cpp.mdx'; import 
CodeExamplePython from './_code_example_python.mdx'; +import CodeExampleJS from './_code_example_js.mdx'; ## Run Model Using OpenVINO GenAI @@ -27,6 +28,16 @@ It can generate text from a text prompt and images as inputs. + + + + + + + + + + :::tip diff --git a/site/docs/use-cases/image-processing/_sections/_usage_options/index.mdx b/site/docs/use-cases/image-processing/_sections/_usage_options/index.mdx index 6ef41e98f1..b953df27bf 100644 --- a/site/docs/use-cases/image-processing/_sections/_usage_options/index.mdx +++ b/site/docs/use-cases/image-processing/_sections/_usage_options/index.mdx @@ -81,6 +81,28 @@ Similar to [text generation](/docs/use-cases/text-generation/#use-different-gene } ``` + + ```javascript + import { VLMPipeline } from 'openvino-genai-node'; + + const pipe = await VLMPipeline(modelPath, "CPU", {}); + + // Create custom generation configuration + const config = { + max_new_tokens: 100, + temperature: 0.7, + top_k: 50, + top_p: 0.9, + repetition_penalty: 1.2 + }; + + // Generate text with custom configuration + const output = await pipe.generate(prompt, { + images: images, + generationConfig: config + }); + ``` + diff --git a/site/src/pages/_sections/UseCasesSection/components/image-processing.tsx b/site/src/pages/_sections/UseCasesSection/components/image-processing.tsx index 50786c9ff3..010150c5a2 100644 --- a/site/src/pages/_sections/UseCasesSection/components/image-processing.tsx +++ b/site/src/pages/_sections/UseCasesSection/components/image-processing.tsx @@ -1,10 +1,11 @@ import Button from '@site/src/components/Button'; -import { LanguageTabs, TabItemCpp, TabItemPython } from '@site/src/components/LanguageTabs'; +import { LanguageTabs, TabItemCpp, TabItemPython, TabItemJS } from '@site/src/components/LanguageTabs'; import UseCaseCard from './UseCaseCard'; import CodeExampleCpp from '@site/docs/use-cases/image-processing/_sections/_run_model/_code_example_cpp.mdx'; import CodeExamplePython from '@site/docs/use-cases/image-processing/_sections/_run_model/_code_example_python.mdx'; +import CodeExampleJS from '@site/docs/use-cases/image-processing/_sections/_run_model/_code_example_js.mdx'; export const ImageProcessing = () => ( @@ -27,6 +28,9 @@ export const ImageProcessing = () => ( + + + diff --git a/src/js/include/addon.hpp b/src/js/include/addon.hpp index 28371ba822..c9b89cc610 100644 --- a/src/js/include/addon.hpp +++ b/src/js/include/addon.hpp @@ -9,8 +9,10 @@ typedef Napi::Function (*Prototype)(Napi::Env); struct AddonData { Napi::FunctionReference core; + Napi::FunctionReference vlm_pipeline; Napi::FunctionReference tokenizer; Napi::FunctionReference perf_metrics; + Napi::FunctionReference vlm_perf_metrics; Napi::FunctionReference chat_history; Napi::ObjectReference openvino_addon; }; diff --git a/src/js/include/base/perf_metrics.hpp b/src/js/include/base/perf_metrics.hpp new file mode 100644 index 0000000000..8d85266e14 --- /dev/null +++ b/src/js/include/base/perf_metrics.hpp @@ -0,0 +1,261 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "bindings_utils.hpp" +#include "include/helper.hpp" +#include "openvino/genai/perf_metrics.hpp" + +using ov::genai::common_bindings::utils::get_ms; +using ov::genai::common_bindings::utils::timestamp_to_ms; + +namespace perf_utils { + +inline Napi::Object create_mean_std_pair(Napi::Env env, const ov::genai::MeanStdPair& pair) { + Napi::Object obj = Napi::Object::New(env); + obj.Set("mean", Napi::Number::New(env, pair.mean)); + obj.Set("std", 
Napi::Number::New(env, pair.std)); + return obj; +} + +inline Napi::Object create_summary_stats(Napi::Env env, const ov::genai::SummaryStats& stats) { + Napi::Object obj = Napi::Object::New(env); + obj.Set("mean", Napi::Number::New(env, stats.mean)); + obj.Set("std", Napi::Number::New(env, stats.std)); + obj.Set("min", Napi::Number::New(env, stats.min)); + obj.Set("max", Napi::Number::New(env, stats.max)); + return obj; +} + +} // namespace perf_utils + +/** + * @brief Base template class for PerfMetrics wrappers. + * + * This class provides common functionality for wrapping ov::genai::PerfMetrics + * and derived classes (like VLMPerfMetrics) in Node.js addon. + * + * @tparam T The derived wrapper class (CRTP pattern). + * @tparam MetricsType The type of metrics to store (default: ov::genai::PerfMetrics). + */ +template +class BasePerfMetricsWrapper : public Napi::ObjectWrap { +public: + using PropertyDescriptor = typename Napi::ObjectWrap::PropertyDescriptor; + + BasePerfMetricsWrapper(const Napi::CallbackInfo& info); + virtual ~BasePerfMetricsWrapper() {} + + /** + * @brief Returns a vector of base class property descriptors. + * + * Derived classes can use this to get all base methods and add their own. + */ + static std::vector get_class_properties(); + + Napi::Value get_load_time(const Napi::CallbackInfo& info); + Napi::Value get_num_generated_tokens(const Napi::CallbackInfo& info); + Napi::Value get_num_input_tokens(const Napi::CallbackInfo& info); + Napi::Value get_ttft(const Napi::CallbackInfo& info); + Napi::Value get_tpot(const Napi::CallbackInfo& info); + Napi::Value get_ipot(const Napi::CallbackInfo& info); + Napi::Value get_throughput(const Napi::CallbackInfo& info); + + Napi::Value get_inference_duration(const Napi::CallbackInfo& info); + Napi::Value get_generate_duration(const Napi::CallbackInfo& info); + Napi::Value get_tokenization_duration(const Napi::CallbackInfo& info); + Napi::Value get_detokenization_duration(const Napi::CallbackInfo& info); + + Napi::Value get_grammar_compiler_init_times(const Napi::CallbackInfo& info); + Napi::Value get_grammar_compile_time(const Napi::CallbackInfo& info); + + /** + * @brief Base implementation of get_raw_metrics. + * + * Derived classes MUST override this method to use it with InstanceAccessor. 
+ * Example: + * + * Napi::Value get_raw_metrics(const Napi::CallbackInfo& info) { + * return BasePerfMetricsWrapper::get_raw_metrics(info); + * } + */ + Napi::Value get_raw_metrics(const Napi::CallbackInfo& info); + Napi::Value add(const Napi::CallbackInfo& info); + MetricsType& get_value(); + +protected: + MetricsType _metrics; +}; + +// Template implementations + +template +BasePerfMetricsWrapper::BasePerfMetricsWrapper(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info), + _metrics{} {} + +template +std::vector::PropertyDescriptor> +BasePerfMetricsWrapper::get_class_properties() { + return { + T::InstanceMethod("getLoadTime", &T::get_load_time), + T::InstanceMethod("getNumGeneratedTokens", &T::get_num_generated_tokens), + T::InstanceMethod("getNumInputTokens", &T::get_num_input_tokens), + T::InstanceMethod("getTTFT", &T::get_ttft), + T::InstanceMethod("getTPOT", &T::get_tpot), + T::InstanceMethod("getIPOT", &T::get_ipot), + T::InstanceMethod("getThroughput", &T::get_throughput), + T::InstanceMethod("getInferenceDuration", &T::get_inference_duration), + T::InstanceMethod("getGenerateDuration", &T::get_generate_duration), + T::InstanceMethod("getTokenizationDuration", &T::get_tokenization_duration), + T::InstanceMethod("getDetokenizationDuration", &T::get_detokenization_duration), + T::InstanceMethod("getGrammarCompilerInitTimes", &T::get_grammar_compiler_init_times), + T::InstanceMethod("getGrammarCompileTime", &T::get_grammar_compile_time), + T::template InstanceAccessor<&T::get_raw_metrics>("rawMetrics"), + T::InstanceMethod("add", &T::add), + }; +} + +template +Napi::Value BasePerfMetricsWrapper::get_load_time(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getLoadTime()"); + return Napi::Number::New(info.Env(), _metrics.get_load_time()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_num_generated_tokens(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getNumGeneratedTokens()"); + return Napi::Number::New(info.Env(), _metrics.get_num_generated_tokens()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_num_input_tokens(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getNumInputTokens()"); + return Napi::Number::New(info.Env(), _metrics.get_num_input_tokens()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_ttft(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getTTFT()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_ttft()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_tpot(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getTPOT()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_tpot()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_ipot(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getIPOT()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_ipot()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_throughput(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getThroughput()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_throughput()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_inference_duration(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getInferenceDuration()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_inference_duration()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_generate_duration(const Napi::CallbackInfo& info) { + 
VALIDATE_ARGS_COUNT(info, 0, "getGenerateDuration()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_generate_duration()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_tokenization_duration(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getTokenizationDuration()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_tokenization_duration()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_detokenization_duration(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getDetokenizationDuration()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_detokenization_duration()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_grammar_compiler_init_times(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getGrammarCompilerInitTimes()"); + return cpp_map_to_js_object(info.Env(), _metrics.get_grammar_compiler_init_times()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_grammar_compile_time(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getGrammarCompileTime()"); + return perf_utils::create_summary_stats(info.Env(), _metrics.get_grammar_compile_time()); +} + +template +Napi::Value BasePerfMetricsWrapper::get_raw_metrics(const Napi::CallbackInfo& info) { + Napi::Object obj = Napi::Object::New(info.Env()); + obj.Set("generateDurations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::generate_durations))); + obj.Set("tokenizationDurations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::tokenization_durations))); + obj.Set("detokenizationDurations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::detokenization_durations))); + + obj.Set("timesToFirstToken", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_times_to_first_token))); + obj.Set("newTokenTimes", + cpp_to_js, Napi::Value>( + info.Env(), + timestamp_to_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_new_token_times))); + obj.Set("tokenInferDurations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_token_infer_durations))); + obj.Set("batchSizes", cpp_to_js, Napi::Value>(info.Env(), _metrics.raw_metrics.m_batch_sizes)); + obj.Set("durations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_durations))); + obj.Set("inferenceDurations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_inference_durations))); + + obj.Set("grammarCompileTimes", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_grammar_compile_times))); + + return obj; +} + +template +Napi::Value BasePerfMetricsWrapper::add(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 1, "add()"); + const auto env = info.Env(); + try { + _metrics += unwrap(env, info[0]); + } catch (const std::exception& ex) { + Napi::TypeError::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return info.This(); +} + +template +MetricsType& BasePerfMetricsWrapper::get_value() { + return _metrics; +} diff --git a/src/js/include/helper.hpp b/src/js/include/helper.hpp index 55370d91e3..cccfcc5281 100644 --- a/src/js/include/helper.hpp +++ b/src/js/include/helper.hpp @@ -1,9 +1,10 @@ #pragma once #include +#include 
"openvino/core/type/element_type.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "openvino/genai/rag/text_embedding_pipeline.hpp" -#include "openvino/core/type/element_type.hpp" +#include "openvino/genai/visual_language/pipeline.hpp" #include "openvino/openvino.hpp" template struct overloaded : Ts... {using Ts::operator()...;}; @@ -64,6 +65,8 @@ template <> ov::genai::StructuredOutputConfig::StructuralTag js_to_cpp(const Napi::Env& env, const Napi::Value& value); template <> ov::Tensor js_to_cpp(const Napi::Env& env, const Napi::Value& value); +template <> +std::vector js_to_cpp>(const Napi::Env& env, const Napi::Value& value); /** * @brief Unwraps a C++ object from a JavaScript wrapper. * @tparam TargetType The C++ class type to extract. @@ -75,6 +78,9 @@ TargetType& unwrap(const Napi::Env& env, const Napi::Value& value); template <> ov::genai::PerfMetrics& unwrap(const Napi::Env& env, const Napi::Value& value); +template <> +ov::genai::VLMPerfMetrics& unwrap(const Napi::Env& env, const Napi::Value& value); + /** * @brief Template function to convert C++ data types into Javascript data types * @tparam TargetType Destinated Javascript data type. @@ -144,3 +150,7 @@ std::string json_stringify(const Napi::Env& env, const Napi::Value& value); Napi::Value json_parse(const Napi::Env& env, const std::string& value); Napi::Function get_prototype_from_ov_addon(const Napi::Env& env, const std::string& ctor_name); + +Napi::Object to_decoded_result(const Napi::Env& env, const ov::genai::DecodedResults& results); + +Napi::Object to_vlm_decoded_result(const Napi::Env& env, const ov::genai::VLMDecodedResults& results); diff --git a/src/js/include/perf_metrics.hpp b/src/js/include/perf_metrics.hpp index dd2aa7f587..fc1ddfbb7b 100644 --- a/src/js/include/perf_metrics.hpp +++ b/src/js/include/perf_metrics.hpp @@ -1,36 +1,19 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include +#include "include/base/perf_metrics.hpp" #include "openvino/genai/perf_metrics.hpp" -class PerfMetricsWrapper : public Napi::ObjectWrap { +class PerfMetricsWrapper : public BasePerfMetricsWrapper { public: PerfMetricsWrapper(const Napi::CallbackInfo& info); static Napi::Function get_class(Napi::Env env); static Napi::Object wrap(Napi::Env env, const ov::genai::PerfMetrics& metrics); - Napi::Value get_load_time(const Napi::CallbackInfo& info); - Napi::Value get_num_generated_tokens(const Napi::CallbackInfo& info); - Napi::Value get_num_input_tokens(const Napi::CallbackInfo& info); - Napi::Value get_ttft(const Napi::CallbackInfo& info); - Napi::Value get_tpot(const Napi::CallbackInfo& info); - Napi::Value get_ipot(const Napi::CallbackInfo& info); - Napi::Value get_throughput(const Napi::CallbackInfo& info); - - Napi::Value get_inference_duration(const Napi::CallbackInfo& info); - Napi::Value get_generate_duration(const Napi::CallbackInfo& info); - Napi::Value get_tokenization_duration(const Napi::CallbackInfo& info); - Napi::Value get_detokenization_duration(const Napi::CallbackInfo& info); - - Napi::Value get_grammar_compiler_init_times(const Napi::CallbackInfo& info); - Napi::Value get_grammar_compile_time(const Napi::CallbackInfo& info); - Napi::Value get_raw_metrics(const Napi::CallbackInfo& info); - Napi::Value add(const Napi::CallbackInfo& info); - ov::genai::PerfMetrics& get_value(); - -private: - ov::genai::PerfMetrics _metrics; }; diff --git a/src/js/include/vlm_pipeline/finish_chat_worker.hpp b/src/js/include/vlm_pipeline/finish_chat_worker.hpp new file mode 
100644 index 0000000000..1c8c621e91 --- /dev/null +++ b/src/js/include/vlm_pipeline/finish_chat_worker.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/visual_language/pipeline.hpp" + +using namespace Napi; + +class VLMFinishChatWorker : public AsyncWorker { +public: + VLMFinishChatWorker(Function& callback, std::shared_ptr& pipe); + virtual ~VLMFinishChatWorker() {} + + void Execute() override; + void OnOK() override; + +private: + std::shared_ptr& pipe; +}; diff --git a/src/js/include/vlm_pipeline/init_worker.hpp b/src/js/include/vlm_pipeline/init_worker.hpp new file mode 100644 index 0000000000..17ca8a794e --- /dev/null +++ b/src/js/include/vlm_pipeline/init_worker.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/visual_language/pipeline.hpp" + +using namespace Napi; + +class VLMInitWorker : public AsyncWorker { +public: + VLMInitWorker(Function& callback, + std::shared_ptr& pipe, + std::shared_ptr is_initializing, + const std::string model_path, + std::string device, + ov::AnyMap properties); + virtual ~VLMInitWorker() {} + + void Execute() override; + void OnOK() override; + void OnError(const Error& e) override; + +private: + std::shared_ptr& pipe; + std::shared_ptr is_initializing; + std::string model_path; + std::string device; + ov::AnyMap properties; +}; diff --git a/src/js/include/vlm_pipeline/perf_metrics.hpp b/src/js/include/vlm_pipeline/perf_metrics.hpp new file mode 100644 index 0000000000..4333b159c1 --- /dev/null +++ b/src/js/include/vlm_pipeline/perf_metrics.hpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "include/base/perf_metrics.hpp" +#include "openvino/genai/visual_language/perf_metrics.hpp" + +class VLMPerfMetricsWrapper : public BasePerfMetricsWrapper { +public: + VLMPerfMetricsWrapper(const Napi::CallbackInfo& info); + + static Napi::Function get_class(Napi::Env env); + static Napi::Object wrap(Napi::Env env, const ov::genai::VLMPerfMetrics& metrics); + + Napi::Value get_prepare_embeddings_duration(const Napi::CallbackInfo& info); + Napi::Value get_raw_metrics(const Napi::CallbackInfo& info); + Napi::Value get_vlm_raw_metrics(const Napi::CallbackInfo& info); +}; diff --git a/src/js/include/vlm_pipeline/start_chat_worker.hpp b/src/js/include/vlm_pipeline/start_chat_worker.hpp new file mode 100644 index 0000000000..cb7ce0ae8d --- /dev/null +++ b/src/js/include/vlm_pipeline/start_chat_worker.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "openvino/genai/visual_language/pipeline.hpp" + +using namespace Napi; + +class VLMStartChatWorker : public AsyncWorker { +public: + VLMStartChatWorker(Function& callback, std::shared_ptr& pipe, std::string system_message); + virtual ~VLMStartChatWorker() {} + + void Execute() override; + void OnOK() override; + +private: + std::shared_ptr& pipe; + std::string system_message; +}; diff --git a/src/js/include/vlm_pipeline/vlm_pipeline_wrapper.hpp b/src/js/include/vlm_pipeline/vlm_pipeline_wrapper.hpp new file mode 100644 index 0000000000..b7a34ab193 --- /dev/null +++ b/src/js/include/vlm_pipeline/vlm_pipeline_wrapper.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 + +#pragma once + +#include + +#include + +#include "openvino/genai/visual_language/pipeline.hpp" + +class VLMPipelineWrapper : public Napi::ObjectWrap { +public: + VLMPipelineWrapper(const Napi::CallbackInfo& info); + + static Napi::Function get_class(Napi::Env env); + + Napi::Value init(const Napi::CallbackInfo& info); + Napi::Value generate(const Napi::CallbackInfo& info); + Napi::Value start_chat(const Napi::CallbackInfo& info); + Napi::Value finish_chat(const Napi::CallbackInfo& info); + Napi::Value get_tokenizer(const Napi::CallbackInfo& info); + Napi::Value set_chat_template(const Napi::CallbackInfo& info); + Napi::Value set_generation_config(const Napi::CallbackInfo& info); + +private: + std::shared_ptr pipe = nullptr; + std::shared_ptr is_initializing = std::make_shared(false); + std::shared_ptr is_generating = std::make_shared(false); +}; diff --git a/src/js/lib/addon.ts b/src/js/lib/addon.ts index b6023e5a09..188af9324f 100644 --- a/src/js/lib/addon.ts +++ b/src/js/lib/addon.ts @@ -1,9 +1,12 @@ import { createRequire } from "module"; import { platform } from "node:os"; import { join, dirname, resolve } from "node:path"; +import { Tensor } from "openvino-node"; import type { ChatHistory as IChatHistory } from "./chatHistory.js"; import type { Tokenizer as ITokenizer } from "./tokenizer.js"; import { addon as ovAddon } from "openvino-node"; +import { GenerationConfig, StreamingStatus, VLMPipelineProperties } from "./utils.js"; +import { VLMPerfMetrics } from "./perfMetrics.js"; export type EmbeddingResult = Float32Array | Int8Array | Uint8Array; export type EmbeddingResults = Float32Array[] | Int8Array[] | Uint8Array[]; @@ -58,9 +61,36 @@ export interface TextEmbeddingPipelineWrapper { embedDocumentsSync(documents: string[]): EmbeddingResults; } +export interface VLMPipeline { + new (): VLMPipeline; + init( + modelPath: string, + device: string, + ovProperties: VLMPipelineProperties, + callback: (err: Error | null) => void, + ): void; + generate( + prompt: string, + images: Tensor[] | undefined, + videos: Tensor[] | undefined, + streamer: ((chunk: string) => StreamingStatus) | undefined, + generationConfig: GenerationConfig | undefined, + callback: ( + err: Error | null, + result: { texts: string[]; scores: number[]; perfMetrics: VLMPerfMetrics }, + ) => void, + ): void; + startChat(systemMessage: string, callback: (err: Error | null) => void): void; + finishChat(callback: (err: Error | null) => void): void; + getTokenizer(): ITokenizer; + setChatTemplate(template: string): void; + setGenerationConfig(config: GenerationConfig): void; +} + interface OpenVINOGenAIAddon { TextEmbeddingPipeline: TextEmbeddingPipelineWrapper; LLMPipeline: any; + VLMPipeline: VLMPipeline; ChatHistory: IChatHistory; Tokenizer: ITokenizer; setOpenvinoAddon: (ovAddon: any) => void; @@ -84,6 +114,6 @@ function getGenAIAddon(): OpenVINOGenAIAddon { const addon = getGenAIAddon(); addon.setOpenvinoAddon(ovAddon); -export const { TextEmbeddingPipeline, LLMPipeline, ChatHistory, Tokenizer } = addon; +export const { TextEmbeddingPipeline, LLMPipeline, VLMPipeline, ChatHistory, Tokenizer } = addon; export type ChatHistory = IChatHistory; export type Tokenizer = ITokenizer; diff --git a/src/js/lib/decodedResults.ts b/src/js/lib/decodedResults.ts new file mode 100644 index 0000000000..f74db73508 --- /dev/null +++ b/src/js/lib/decodedResults.ts @@ -0,0 +1,56 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import { PerfMetrics, VLMPerfMetrics } from 
"./perfMetrics.js"; + +/** + * Structure to store resulting batched text outputs and scores for each batch. + * @note The first num_return_sequences elements correspond to the first batch element. + */ +export class DecodedResults { + /** + * @param {string[]} texts - Vector of resulting sequences. + * @param {number[]} scores - Scores for each sequence. + * @param {PerfMetrics} perfMetrics - Performance metrics (tpot, ttft, etc.). + */ + constructor(texts: string[], scores: number[], perfMetrics: PerfMetrics) { + this.texts = texts; + this.scores = scores; + this.perfMetrics = perfMetrics; + } + toString() { + if (this.scores.length !== this.texts.length) { + throw new Error("The number of scores and texts doesn't match in DecodedResults."); + } + if (this.texts.length === 0) { + return ""; + } + if (this.texts.length === 1) { + return this.texts[0]; + } + const lines = this.scores.map((score, i) => `${score.toFixed(6)}: ${this.texts[i]}`); + return lines.join("\n"); + } + texts: string[]; + scores: number[]; + perfMetrics: PerfMetrics; +} + +/** + * Structure to store VLM resulting batched text outputs and scores for each batch. + * @note The first num_return_sequences elements correspond to the first batch element. + */ +export class VLMDecodedResults extends DecodedResults { + /** + * @param {string[]} texts - Vector of resulting sequences. + * @param {number[]} scores - Scores for each sequence. + * @param {VLMPerfMetrics} perfMetrics - VLM-specific performance metrics. + */ + constructor(texts: string[], scores: number[], perfMetrics: VLMPerfMetrics) { + super(texts, scores, perfMetrics); + this.perfMetrics = perfMetrics; + } + + /** VLM specific performance metrics. */ + perfMetrics: VLMPerfMetrics; +} diff --git a/src/js/lib/index.ts b/src/js/lib/index.ts index ad8e49168f..dd36cf2227 100644 --- a/src/js/lib/index.ts +++ b/src/js/lib/index.ts @@ -2,8 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 import { LLMPipeline as LLM } from "./pipelines/llmPipeline.js"; +import { VLMPipeline as VLM } from "./pipelines/vlmPipeline.js"; import { TextEmbeddingPipeline as Embedding } from "./pipelines/textEmbeddingPipeline.js"; -import { LLMPipelineProperties } from "./utils.js"; +import { LLMPipelineProperties, VLMPipelineProperties } from "./utils.js"; class PipelineFactory { static async LLMPipeline(modelPath: string, device?: string): Promise; @@ -28,6 +29,18 @@ class PipelineFactory { await pipeline.init(); return pipeline; } + + static async VLMPipeline( + modelPath: string, + device: string = "CPU", + properties: VLMPipelineProperties = {}, + ) { + const pipeline = new VLM(modelPath, device, properties); + await pipeline.init(); + + return pipeline; + } + static async TextEmbeddingPipeline(modelPath: string, device = "CPU", config = {}) { const pipeline = new Embedding(modelPath, device, config); await pipeline.init(); @@ -36,8 +49,9 @@ class PipelineFactory { } } -export const { LLMPipeline, TextEmbeddingPipeline } = PipelineFactory; -export { DecodedResults } from "./pipelines/llmPipeline.js"; +export const { LLMPipeline, VLMPipeline, TextEmbeddingPipeline } = PipelineFactory; +export { DecodedResults, VLMDecodedResults } from "./decodedResults.js"; +export { PerfMetrics, VLMPerfMetrics } from "./perfMetrics.js"; export * from "./utils.js"; export * from "./addon.js"; export type { TokenizedInputs, EncodeOptions, DecodeOptions } from "./tokenizer.js"; diff --git a/src/js/lib/perfMetrics.ts b/src/js/lib/perfMetrics.ts new file mode 100644 index 0000000000..d312505024 --- /dev/null +++ 
b/src/js/lib/perfMetrics.ts @@ -0,0 +1,118 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +/** Structure holding mean and standard deviation values. */ +export type MeanStdPair = { + mean: number; + std: number; +}; + +/** Structure holding summary of statistical values */ +export type SummaryStats = MeanStdPair & { + min: number; + max: number; +}; + +/** Structure with raw performance metrics for each generation before any statistics are calculated. */ +export type RawMetrics = { + /** Durations for each generate call in milliseconds. */ + generateDurations: number[]; + /** Durations for the tokenization process in milliseconds. */ + tokenizationDurations: number[]; + /** Durations for the detokenization process in milliseconds. */ + detokenizationDurations: number[]; + /** Times to the first token for each call in milliseconds. */ + timesToFirstToken: number[]; + /** Timestamps of generation every token or batch of tokens in milliseconds. */ + newTokenTimes: number[]; + /** Inference time for each token in milliseconds. */ + tokenInferDurations: number[]; + /** Batch sizes for each generate call. */ + batchSizes: number[]; + /** Total durations for each generate call in milliseconds. */ + durations: number[]; + /** Total inference duration for each generate call in microseconds. */ + inferenceDurations: number[]; + /** Time to compile the grammar in milliseconds. */ + grammarCompileTimes: number[]; +}; + +/** Structure with raw performance metrics for VLM generation. */ +export type VLMRawMetrics = { + /** Durations for embedding preparation in milliseconds. */ + prepareEmbeddingsDurations: number[]; +}; + +/** + * Holds performance metrics for each generate call. + * + * PerfMetrics holds the following metrics with mean and standard deviations: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Inference time per Output Token (IPOT), ms/token + - Generate total duration, ms + - Inference duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + - Time to initialize grammar compiler for each backend, ms + - Time to compile grammar, ms + * Preferable way to access metrics is via getter methods. Getter methods calculate mean and std values from rawMetrics and return pairs. + * If mean and std were already calculated, getter methods return cached values. + */ +export interface PerfMetrics { + /** Returns the load time in milliseconds. */ + getLoadTime(): number; + /** Returns the number of generated tokens. */ + getNumGeneratedTokens(): number; + /** Returns the number of tokens in the input prompt. */ + getNumInputTokens(): number; + /** Returns the mean and standard deviation of Time To the First Token (TTFT) in milliseconds. */ + getTTFT(): MeanStdPair; + /** Returns the mean and standard deviation of Time Per Output Token (TPOT) in milliseconds. */ + getTPOT(): MeanStdPair; + /** Returns the mean and standard deviation of Inference time Per Output Token in milliseconds. */ + getIPOT(): MeanStdPair; + /** Returns the mean and standard deviation of throughput in tokens per second. */ + getThroughput(): MeanStdPair; + /** Returns the mean and standard deviation of the time spent on model inference during generate call in milliseconds. */ + getInferenceDuration(): MeanStdPair; + /** Returns the mean and standard deviation of generate durations in milliseconds. 
*/ + getGenerateDuration(): MeanStdPair; + /** Returns the mean and standard deviation of tokenization durations in milliseconds. */ + getTokenizationDuration(): MeanStdPair; + /** Returns the mean and standard deviation of detokenization durations in milliseconds. */ + getDetokenizationDuration(): MeanStdPair; + /** Returns a map with the time to initialize the grammar compiler for each backend in milliseconds. */ + getGrammarCompilerInitTimes(): { [key: string]: number }; + /** Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds. */ + getGrammarCompileTime(): SummaryStats; + /** A structure of RawPerfMetrics type that holds raw metrics. */ + rawMetrics: RawMetrics; + + /** Adds the metrics from another PerfMetrics object to this one. + * @returns The current PerfMetrics instance. + */ + add(other: PerfMetrics): this; +} + +/** + * Holds performance metrics for each VLM generate call. + * + * VLMPerfMetrics extends PerfMetrics with VLM-specific metrics: + * - Prepare embeddings duration, ms + */ +export interface VLMPerfMetrics extends PerfMetrics { + /** Returns the mean and standard deviation of embeddings preparation duration in milliseconds. */ + getPrepareEmbeddingsDuration(): MeanStdPair; + /** VLM specific raw metrics */ + vlmRawMetrics: VLMRawMetrics; + + /** Adds the metrics from another VLMPerfMetrics object to this one. + * @returns The current VLMPerfMetrics instance. + */ + add(other: VLMPerfMetrics): this; +} diff --git a/src/js/lib/pipelines/llmPipeline.ts b/src/js/lib/pipelines/llmPipeline.ts index f05e654e5e..ce2a418436 100644 --- a/src/js/lib/pipelines/llmPipeline.ts +++ b/src/js/lib/pipelines/llmPipeline.ts @@ -1,6 +1,7 @@ import util from "node:util"; import { ChatHistory, LLMPipeline as LLMPipelineWrap } from "../addon.js"; import { GenerationConfig, StreamingStatus, LLMPipelineProperties } from "../utils.js"; +import { DecodedResults } from "../decodedResults.js"; import { Tokenizer } from "../tokenizer.js"; export type ResolveFunction = (arg: { value: string; done: boolean }) => void; @@ -9,131 +10,6 @@ export type Options = { max_new_tokens?: number; }; -/** Structure with raw performance metrics for each generation before any statistics are calculated. */ -export type RawMetrics = { - /** Durations for each generate call in milliseconds. */ - generateDurations: number[]; - /** Durations for the tokenization process in milliseconds. */ - tokenizationDurations: number[]; - /** Durations for the detokenization process in milliseconds. */ - detokenizationDurations: number[]; - /** Times to the first token for each call in milliseconds. */ - timesToFirstToken: number[]; - /** Timestamps of generation every token or batch of tokens in milliseconds. */ - newTokenTimes: number[]; - /** Inference time for each token in milliseconds. */ - tokenInferDurations: number[]; - /** Batch sizes for each generate call. */ - batchSizes: number[]; - /** Total durations for each generate call in milliseconds. */ - durations: number[]; - /** Total inference duration for each generate call in microseconds. */ - inferenceDurations: number[]; - /** Time to compile the grammar in milliseconds. */ - grammarCompileTimes: number[]; -}; - -/** Structure holding mean and standard deviation values. 
*/ -export type MeanStdPair = { - mean: number; - std: number; -}; - -/** Structure holding summary of statistical values */ -export type SummaryStats = { - mean: number; - std: number; - min: number; - max: number; -}; - -/** - * Holds performance metrics for each generate call. - * - * PerfMetrics holds the following metrics with mean and standard deviations: - - Time To the First Token (TTFT), ms - - Time per Output Token (TPOT), ms/token - - Inference time per Output Token (IPOT), ms/token - - Generate total duration, ms - - Inference duration, ms - - Tokenization duration, ms - - Detokenization duration, ms - - Throughput, tokens/s - - Load time, ms - - Number of generated tokens - - Number of tokens in the input prompt - - Time to initialize grammar compiler for each backend, ms - - Time to compile grammar, ms - * Preferable way to access metrics is via getter methods. Getter methods calculate mean and std values from rawMetrics and return pairs. - * If mean and std were already calculated, getter methods return cached values. - */ -export interface PerfMetrics { - /** Returns the load time in milliseconds. */ - getLoadTime(): number; - /** Returns the number of generated tokens. */ - getNumGeneratedTokens(): number; - /** Returns the number of tokens in the input prompt. */ - getNumInputTokens(): number; - /** Returns the mean and standard deviation of Time To the First Token (TTFT) in milliseconds. */ - getTTFT(): MeanStdPair; - /** Returns the mean and standard deviation of Time Per Output Token (TPOT) in milliseconds. */ - getTPOT(): MeanStdPair; - /** Returns the mean and standard deviation of Inference time Per Output Token in milliseconds. */ - getIPOT(): MeanStdPair; - /** Returns the mean and standard deviation of throughput in tokens per second. */ - getThroughput(): MeanStdPair; - /** Returns the mean and standard deviation of the time spent on model inference during generate call in milliseconds. */ - getInferenceDuration(): MeanStdPair; - /** Returns the mean and standard deviation of generate durations in milliseconds. */ - getGenerateDuration(): MeanStdPair; - /** Returns the mean and standard deviation of tokenization durations in milliseconds. */ - getTokenizationDuration(): MeanStdPair; - /** Returns the mean and standard deviation of detokenization durations in milliseconds. */ - getDetokenizationDuration(): MeanStdPair; - /** Returns a map with the time to initialize the grammar compiler for each backend in milliseconds. */ - getGrammarCompilerInitTimes(): { [key: string]: number }; - /** Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds. */ - getGrammarCompileTime(): SummaryStats; - /** A structure of RawPerfMetrics type that holds raw metrics. */ - rawMetrics: RawMetrics; - - /** Adds the metrics from another PerfMetrics object to this one. - * @returns The current PerfMetrics instance. 
- */ - add(other: PerfMetrics): this; -} - -export class DecodedResults { - constructor(texts: string[], scores: number[], perfMetrics: PerfMetrics) { - this.texts = texts; - this.scores = scores; - this.perfMetrics = perfMetrics; - } - toString() { - if (this.scores.length !== this.texts.length) { - throw new Error("The number of scores and texts doesn't match in DecodedResults."); - } - if (this.texts.length === 0) { - return ""; - } - if (this.texts.length === 1) { - return this.texts[0]; - } - let result = ""; - for (let i = 0; i < this.texts.length - 1; ++i) { - result += `${this.scores[i].toFixed(6)}: ${this.texts[i]}\n`; - } - result += `${this.scores[this.scores.length - 1].toFixed( - 6, - )}: ${this.texts[this.texts.length - 1]}`; - - return result; - } - texts: string[]; - scores: number[]; - perfMetrics: PerfMetrics; -} - export class LLMPipeline { modelPath: string; device: string; diff --git a/src/js/lib/pipelines/vlmPipeline.ts b/src/js/lib/pipelines/vlmPipeline.ts new file mode 100644 index 0000000000..cdede8ee25 --- /dev/null +++ b/src/js/lib/pipelines/vlmPipeline.ts @@ -0,0 +1,225 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import util from "node:util"; +import { VLMPipeline as VLMPipelineWrapper } from "../addon.js"; +import { GenerationConfig, VLMPipelineProperties, StreamingStatus } from "../utils.js"; +import { VLMDecodedResults } from "../decodedResults.js"; +import { Tokenizer } from "../tokenizer.js"; +import type { Tensor } from "openvino-node"; +import { VLMPerfMetrics } from "../perfMetrics.js"; + +/** + * Options for VLM generation methods. + */ +export type VLMGenerateOptions = { + /** Array of image tensors to include in the prompt. */ + images?: Tensor[]; + /** Array of video frame tensors to include in the prompt. */ + videos?: Tensor[]; + /** Generation configuration parameters such as max_length, temperature, etc. */ + generationConfig?: GenerationConfig; +}; + +/** + * This class is used for generation with Visual Language Models (VLMs) + */ +export class VLMPipeline { + protected readonly modelPath: string; + protected readonly device: string; + protected pipeline: VLMPipelineWrapper | null = null; + protected readonly properties: VLMPipelineProperties; + + /** + * Construct a VLM pipeline from a folder containing tokenizer and model IRs. + * @param modelPath - A folder to read tokenizer and model IRs. + * @param device - Inference device. A tokenizer is always compiled for CPU. + * @param properties - Device and pipeline properties. + */ + constructor(modelPath: string, device: string, properties: VLMPipelineProperties) { + this.modelPath = modelPath; + this.device = device; + this.properties = properties; + } + + /** + * Initialize the underlying native pipeline. + * @returns Resolves when initialization is complete. + */ + async init() { + const pipeline = new VLMPipelineWrapper(); + + const initPromise = util.promisify(pipeline.init.bind(pipeline)); + await initPromise(this.modelPath, this.device, this.properties); + + this.pipeline = pipeline; + } + /** + * Start a chat session with an optional system message. + * @param systemMessage - Optional system message to initialize chat context. + * @returns Resolves when chat session is started. 
+ */ + async startChat(systemMessage: string = "") { + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + + const startChatPromise = util.promisify(this.pipeline.startChat.bind(this.pipeline)); + const result = await startChatPromise(systemMessage); + + return result; + } + /** + * Finish the current chat session and clear chat-related state. + * @returns Resolves when chat session is finished. + */ + async finishChat() { + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + + const finishChatPromise = util.promisify(this.pipeline.finishChat.bind(this.pipeline)); + const result = await finishChatPromise(); + + return result; + } + /** + * Stream generation results as an async iterator of strings. + * The iterator yields subword chunks. + * @param prompt - Input prompt. May contain image/video tags recognized by the model. + * @param options - Optional parameters. + * @param options.images - Array of image tensors to include in the prompt. + * @param options.videos - Array of video frame tensors to include in the prompt. + * @param options.generationConfig - Generation parameters. + * @returns Async iterator producing subword chunks. + */ + stream(prompt: string, options: VLMGenerateOptions = {}): AsyncIterableIterator { + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + const { images, videos, generationConfig } = options; + + let streamingStatus: StreamingStatus = StreamingStatus.RUNNING; + const queue: { done: boolean; subword: string }[] = []; + type ResolveFunction = (arg: { value: string; done: boolean }) => void; + type RejectFunction = (reason?: unknown) => void; + let resolvePromise: ResolveFunction | null; + let rejectPromise: RejectFunction | null; + + const callback = ( + error: Error | null, + result: { texts: string[]; scores: number[]; perfMetrics: VLMPerfMetrics }, + ) => { + if (error) { + if (rejectPromise) { + rejectPromise(error); + // Reset promises + resolvePromise = null; + rejectPromise = null; + } else { + throw error; + } + } else { + const decodedResult = new VLMDecodedResults( + result.texts, + result.scores, + result.perfMetrics, + ); + const fullText = decodedResult.toString(); + if (resolvePromise) { + // Fulfill pending request + resolvePromise({ done: true, value: fullText }); + // Reset promises + resolvePromise = null; + rejectPromise = null; + } else { + // Add data to queue if no pending promise + queue.push({ done: true, subword: fullText }); + } + } + }; + + const streamer = (chunk: string): StreamingStatus => { + if (resolvePromise) { + // Fulfill pending request + resolvePromise({ done: false, value: chunk }); + // Reset promises + resolvePromise = null; + rejectPromise = null; + } else { + // Add data to queue if no pending promise + queue.push({ done: false, subword: chunk }); + } + return streamingStatus; + }; + + this.pipeline.generate(prompt, images, videos, streamer, generationConfig, callback); + + return { + async next() { + // If there is data in the queue, return it + // Otherwise, return a promise that will resolve when data is available + const data = queue.shift(); + + if (data) { + return { value: data.subword, done: data.done }; + } + + return new Promise((resolve: ResolveFunction, reject: (reason?: unknown) => void) => { + resolvePromise = resolve; + rejectPromise = reject; + }); + }, + async return() { + streamingStatus = StreamingStatus.CANCEL; + + return { done: true, value: "" }; + }, + [Symbol.asyncIterator]() { + return this; + }, + }; + } + /** + * Generate sequences for 
VLMs. + * @param prompt - Input prompt. May contain model-specific image/video tags. + * @param options - Optional parameters. + * @param options.images - Images to include in the prompt. + * @param options.videos - Videos to include in the prompt. + * @param options.generationConfig - Generation configuration parameters. + * @param options.streamer - Optional streamer callback called for each chunk. + * @returns Resolves with decoded results once generation finishes. + */ + async generate( + prompt: string, + options: VLMGenerateOptions & { streamer?: (chunk: string) => StreamingStatus } = {}, + ): Promise { + const { images, videos, generationConfig, streamer } = options; + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + const innerGenerate = util.promisify(this.pipeline.generate.bind(this.pipeline)); + const result = await innerGenerate(prompt, images, videos, streamer, generationConfig); + + return new VLMDecodedResults(result.texts, result.scores, result.perfMetrics); + } + + /** + * Get the pipeline tokenizer instance. + * @returns Tokenizer used by the pipeline. + */ + getTokenizer(): Tokenizer { + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + return this.pipeline.getTokenizer(); + } + + /** + * Set the chat template used when formatting chat history and prompts. + * @param chatTemplate - Chat template string. + */ + setChatTemplate(chatTemplate: string): void { + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + this.pipeline.setChatTemplate(chatTemplate); + } + + /** + * Set generation configuration parameters. + * @param config - Generation configuration parameters. + */ + setGenerationConfig(config: GenerationConfig): void { + if (!this.pipeline) throw new Error("Pipeline is not initialized"); + this.pipeline.setGenerationConfig(config); + } +} diff --git a/src/js/lib/utils.ts b/src/js/lib/utils.ts index 43684e9fa0..17211549e2 100644 --- a/src/js/lib/utils.ts +++ b/src/js/lib/utils.ts @@ -344,3 +344,7 @@ export type SchedulerConfig = { export type LLMPipelineProperties = { schedulerConfig?: SchedulerConfig; }; + +export type VLMPipelineProperties = { + schedulerConfig?: SchedulerConfig; +} & Record; diff --git a/src/js/src/addon.cpp b/src/js/src/addon.cpp index 72cb3b6b16..80c8d800b1 100644 --- a/src/js/src/addon.cpp +++ b/src/js/src/addon.cpp @@ -5,6 +5,8 @@ #include "include/perf_metrics.hpp" #include "include/llm_pipeline/llm_pipeline_wrapper.hpp" +#include "include/vlm_pipeline/vlm_pipeline_wrapper.hpp" +#include "include/vlm_pipeline/perf_metrics.hpp" #include "include/text_embedding_pipeline/pipeline_wrapper.hpp" #include "include/tokenizer.hpp" #include "include/chat_history.hpp" @@ -47,9 +49,11 @@ Napi::Object init_module(Napi::Env env, Napi::Object exports) { env.SetInstanceData(addon_data); init_class(env, exports, "LLMPipeline", &LLMPipelineWrapper::get_class, addon_data->core); + init_class(env, exports, "VLMPipeline", &VLMPipelineWrapper::get_class, addon_data->vlm_pipeline); init_class(env, exports, "TextEmbeddingPipeline", &TextEmbeddingPipelineWrapper::get_class, addon_data->core); init_class(env, exports, "Tokenizer", &TokenizerWrapper::get_class, addon_data->tokenizer); init_class(env, exports, "PerfMetrics", &PerfMetricsWrapper::get_class, addon_data->perf_metrics); + init_class(env, exports, "VLMPerfMetrics", &VLMPerfMetricsWrapper::get_class, addon_data->vlm_perf_metrics); init_class(env, exports, "ChatHistory", &ChatHistoryWrap::get_class, addon_data->chat_history); // Expose a helper to set 
the openvino-node addon from JS (useful for ESM) diff --git a/src/js/src/helper.cpp b/src/js/src/helper.cpp index 5414a7f522..523f09568d 100644 --- a/src/js/src/helper.cpp +++ b/src/js/src/helper.cpp @@ -3,6 +3,7 @@ #include "include/addon.hpp" #include "include/chat_history.hpp" #include "include/perf_metrics.hpp" +#include "include/vlm_pipeline/perf_metrics.hpp" namespace { constexpr const char* JS_SCHEDULER_CONFIG_KEY = "schedulerConfig"; @@ -337,6 +338,25 @@ ov::Tensor js_to_cpp(const Napi::Env& env, const Napi::Value& value) return *tensor_ptr; } +template <> +std::vector js_to_cpp>(const Napi::Env& env, const Napi::Value& value) { + std::vector tensors; + if (value.IsUndefined() || value.IsNull()) { + return tensors; + } + if (value.IsArray()) { + auto array = value.As(); + size_t length = array.Length(); + tensors.reserve(length); + for (uint32_t i = 0; i < length; ++i) { + tensors.push_back(js_to_cpp(env, array[i])); + } + } else { + OPENVINO_THROW("Passed argument must be an array of Tensors."); + } + return tensors; +} + template <> ov::genai::PerfMetrics& unwrap(const Napi::Env& env, const Napi::Value& value) { const auto obj = value.As(); @@ -350,6 +370,17 @@ ov::genai::PerfMetrics& unwrap(const Napi::Env& env, con return js_metrics->get_value(); } +template <> +ov::genai::VLMPerfMetrics& unwrap(const Napi::Env& env, const Napi::Value& value) { + const auto obj = value.As(); + const auto& prototype = env.GetInstanceData()->vlm_perf_metrics; + OPENVINO_ASSERT(prototype, "Invalid pointer to prototype."); + OPENVINO_ASSERT(obj.InstanceOf(prototype.Value().As()), + "Passed argument is not of type VLMPerfMetrics"); + const auto js_metrics = Napi::ObjectWrap::Unwrap(obj); + return js_metrics->get_value(); +} + template <> ov::genai::ChatHistory& unwrap(const Napi::Env& env, const Napi::Value& value) { OPENVINO_ASSERT(value.IsObject(), "Passed argument must be an object."); @@ -539,3 +570,20 @@ Napi::Function get_prototype_from_ov_addon(const Napi::Env& env, const std::stri return ctor_val.As(); } + +Napi::Object to_decoded_result(const Napi::Env& env, const ov::genai::DecodedResults& results) { + Napi::Object obj = Napi::Object::New(env); + obj.Set("texts", cpp_to_js, Napi::Value>(env, results.texts)); + obj.Set("scores", cpp_to_js, Napi::Value>(env, results.scores)); + obj.Set("perfMetrics", PerfMetricsWrapper::wrap(env, results.perf_metrics)); + obj.Set("subword", Napi::String::New(env, results)); + return obj; +} + +Napi::Object to_vlm_decoded_result(const Napi::Env& env, const ov::genai::VLMDecodedResults& results) { + Napi::Object obj = Napi::Object::New(env); + obj.Set("texts", cpp_to_js, Napi::Value>(env, results.texts)); + obj.Set("scores", cpp_to_js, Napi::Value>(env, results.scores)); + obj.Set("perfMetrics", VLMPerfMetricsWrapper::wrap(env, results.perf_metrics)); + return obj; +} diff --git a/src/js/src/llm_pipeline/llm_pipeline_wrapper.cpp b/src/js/src/llm_pipeline/llm_pipeline_wrapper.cpp index 6a78ad24f8..19327b1e60 100644 --- a/src/js/src/llm_pipeline/llm_pipeline_wrapper.cpp +++ b/src/js/src/llm_pipeline/llm_pipeline_wrapper.cpp @@ -22,15 +22,6 @@ struct TsfnContext { std::shared_ptr options = nullptr; }; -Napi::Object create_decoded_results_object(Napi::Env env, const ov::genai::DecodedResults& result) { - Napi::Object obj = Napi::Object::New(env); - obj.Set("texts", cpp_to_js, Napi::Value>(env, result.texts)); - obj.Set("scores", cpp_to_js, Napi::Value>(env, result.scores)); - obj.Set("perfMetrics", PerfMetricsWrapper::wrap(env, result.perf_metrics)); - 
obj.Set("subword", Napi::String::New(env, result)); - return obj; -} - void performInferenceThread(TsfnContext* context) { try { ov::genai::GenerationConfig config; @@ -89,7 +80,7 @@ void performInferenceThread(TsfnContext* context) { }, context->inputs); napi_status status = context->tsfn.BlockingCall([result](Napi::Env env, Napi::Function jsCallback) { - jsCallback.Call({Napi::Boolean::New(env, true), create_decoded_results_object(env, result)}); + jsCallback.Call({Napi::Boolean::New(env, true), to_decoded_result(env, result)}); }); if (status != napi_ok) { diff --git a/src/js/src/perf_metrics.cpp b/src/js/src/perf_metrics.cpp index 1dfdbb62bb..cfce4c24df 100644 --- a/src/js/src/perf_metrics.cpp +++ b/src/js/src/perf_metrics.cpp @@ -1,37 +1,16 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include "include/perf_metrics.hpp" -#include "bindings_utils.hpp" #include "include/addon.hpp" -#include "include/helper.hpp" - -using ov::genai::common_bindings::utils::get_ms; -using ov::genai::common_bindings::utils::timestamp_to_ms; PerfMetricsWrapper::PerfMetricsWrapper(const Napi::CallbackInfo& info) - : Napi::ObjectWrap(info), - _metrics{} {}; + : BasePerfMetricsWrapper(info) {} Napi::Function PerfMetricsWrapper::get_class(Napi::Env env) { - return DefineClass( - env, - "PerfMetrics", - { - InstanceMethod("getLoadTime", &PerfMetricsWrapper::get_load_time), - InstanceMethod("getNumGeneratedTokens", &PerfMetricsWrapper::get_num_generated_tokens), - InstanceMethod("getNumInputTokens", &PerfMetricsWrapper::get_num_input_tokens), - InstanceMethod("getTTFT", &PerfMetricsWrapper::get_ttft), - InstanceMethod("getTPOT", &PerfMetricsWrapper::get_tpot), - InstanceMethod("getIPOT", &PerfMetricsWrapper::get_ipot), - InstanceMethod("getThroughput", &PerfMetricsWrapper::get_throughput), - InstanceMethod("getInferenceDuration", &PerfMetricsWrapper::get_inference_duration), - InstanceMethod("getGenerateDuration", &PerfMetricsWrapper::get_generate_duration), - InstanceMethod("getTokenizationDuration", &PerfMetricsWrapper::get_tokenization_duration), - InstanceMethod("getDetokenizationDuration", &PerfMetricsWrapper::get_detokenization_duration), - InstanceMethod("getGrammarCompilerInitTimes", &PerfMetricsWrapper::get_grammar_compiler_init_times), - InstanceMethod("getGrammarCompileTime", &PerfMetricsWrapper::get_grammar_compile_time), - InstanceAccessor<&PerfMetricsWrapper::get_raw_metrics>("rawMetrics"), - InstanceMethod("add", &PerfMetricsWrapper::add), - }); + auto properties = BasePerfMetricsWrapper::get_class_properties(); + return DefineClass(env, "PerfMetrics", properties); } Napi::Object PerfMetricsWrapper::wrap(Napi::Env env, const ov::genai::PerfMetrics& metrics) { @@ -43,143 +22,6 @@ Napi::Object PerfMetricsWrapper::wrap(Napi::Env env, const ov::genai::PerfMetric return obj; } -Napi::Value PerfMetricsWrapper::get_load_time(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getLoadTime()"); - return Napi::Number::New(info.Env(), _metrics.get_load_time()); -} - -Napi::Value PerfMetricsWrapper::get_num_generated_tokens(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getNumGeneratedTokens()"); - return Napi::Number::New(info.Env(), _metrics.get_num_generated_tokens()); -} - -Napi::Value PerfMetricsWrapper::get_num_input_tokens(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getNumInputTokens()"); - return Napi::Number::New(info.Env(), _metrics.get_num_input_tokens()); -} - -Napi::Object 
create_mean_std_pair(Napi::Env env, const ov::genai::MeanStdPair& pair) { - Napi::Object obj = Napi::Object::New(env); - obj.Set("mean", Napi::Number::New(env, pair.mean)); - obj.Set("std", Napi::Number::New(env, pair.std)); - return obj; -} - -Napi::Object create_summary_stats(Napi::Env env, const ov::genai::SummaryStats& stats) { - Napi::Object obj = Napi::Object::New(env); - obj.Set("mean", Napi::Number::New(env, stats.mean)); - obj.Set("std", Napi::Number::New(env, stats.std)); - obj.Set("min", Napi::Number::New(env, stats.min)); - obj.Set("max", Napi::Number::New(env, stats.max)); - return obj; -} - -Napi::Value PerfMetricsWrapper::get_ttft(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getTTFT()"); - return create_mean_std_pair(info.Env(), _metrics.get_ttft()); -} - -Napi::Value PerfMetricsWrapper::get_tpot(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getTPOT()"); - return create_mean_std_pair(info.Env(), _metrics.get_tpot()); -} - -Napi::Value PerfMetricsWrapper::get_ipot(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getIPOT()"); - return create_mean_std_pair(info.Env(), _metrics.get_ipot()); -} - -Napi::Value PerfMetricsWrapper::get_throughput(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getThroughput()"); - return create_mean_std_pair(info.Env(), _metrics.get_throughput()); -} - -Napi::Value PerfMetricsWrapper::get_inference_duration(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getInferenceDuration()"); - return create_mean_std_pair(info.Env(), _metrics.get_inference_duration()); -} - -Napi::Value PerfMetricsWrapper::get_generate_duration(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getGenerateDuration()"); - return create_mean_std_pair(info.Env(), _metrics.get_generate_duration()); -} - -Napi::Value PerfMetricsWrapper::get_tokenization_duration(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getTokenizationDuration()"); - return create_mean_std_pair(info.Env(), _metrics.get_tokenization_duration()); -} - -Napi::Value PerfMetricsWrapper::get_grammar_compiler_init_times(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getGrammarCompilerInitTimes()"); - return cpp_map_to_js_object(info.Env(), _metrics.get_grammar_compiler_init_times()); -} - -Napi::Value PerfMetricsWrapper::get_grammar_compile_time(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getGrammarCompileTime()"); - return create_summary_stats(info.Env(), _metrics.get_grammar_compile_time()); -}; - -Napi::Value PerfMetricsWrapper::get_detokenization_duration(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 0, "getDetokenizationDuration()"); - return create_mean_std_pair(info.Env(), _metrics.get_detokenization_duration()); -} - Napi::Value PerfMetricsWrapper::get_raw_metrics(const Napi::CallbackInfo& info) { - Napi::Object obj = Napi::Object::New(info.Env()); - obj.Set("generateDurations", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::generate_durations))); - obj.Set("tokenizationDurations", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::tokenization_durations))); - obj.Set("detokenizationDurations", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::detokenization_durations))); - - obj.Set("timesToFirstToken", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, 
&ov::genai::RawPerfMetrics::m_times_to_first_token))); - obj.Set("newTokenTimes", - cpp_to_js, Napi::Value>( - info.Env(), - timestamp_to_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_new_token_times))); - obj.Set("tokenInferDurations", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_token_infer_durations))); - obj.Set("batchSizes", cpp_to_js, Napi::Value>(info.Env(), _metrics.raw_metrics.m_batch_sizes)); - obj.Set("durations", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_durations))); - obj.Set("inferenceDurations", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_inference_durations))); - - obj.Set("grammarCompileTimes", - cpp_to_js, Napi::Value>( - info.Env(), - get_ms(_metrics.raw_metrics, &ov::genai::RawPerfMetrics::m_grammar_compile_times))); - - return obj; -} - -Napi::Value PerfMetricsWrapper::add(const Napi::CallbackInfo& info) { - VALIDATE_ARGS_COUNT(info, 1, "add()"); - const auto env = info.Env(); - try { - _metrics += unwrap(env, info[0]); - } catch (const std::exception& ex) { - Napi::TypeError::New(env, ex.what()).ThrowAsJavaScriptException(); - } - return info.This(); -} - -ov::genai::PerfMetrics& PerfMetricsWrapper::get_value() { - return _metrics; + return BasePerfMetricsWrapper::get_raw_metrics(info); } diff --git a/src/js/src/vlm_pipeline/finish_chat_worker.cpp b/src/js/src/vlm_pipeline/finish_chat_worker.cpp new file mode 100644 index 0000000000..764be64719 --- /dev/null +++ b/src/js/src/vlm_pipeline/finish_chat_worker.cpp @@ -0,0 +1,16 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "include/vlm_pipeline/finish_chat_worker.hpp" + +VLMFinishChatWorker::VLMFinishChatWorker(Function& callback, std::shared_ptr& pipe) + : AsyncWorker(callback), + pipe(pipe) {}; + +void VLMFinishChatWorker::Execute() { + this->pipe->finish_chat(); +}; + +void VLMFinishChatWorker::OnOK() { + Callback().Call({Env().Null()}); +}; diff --git a/src/js/src/vlm_pipeline/init_worker.cpp b/src/js/src/vlm_pipeline/init_worker.cpp new file mode 100644 index 0000000000..49e93608da --- /dev/null +++ b/src/js/src/vlm_pipeline/init_worker.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "include/vlm_pipeline/init_worker.hpp" + +VLMInitWorker::VLMInitWorker(Function& callback, + std::shared_ptr& pipe, + std::shared_ptr is_initializing, + const std::string model_path, + const std::string device, + const ov::AnyMap properties) + : AsyncWorker(callback), + pipe(pipe), + is_initializing(is_initializing), + model_path(model_path), + device(device), + properties(properties) {}; + +void VLMInitWorker::Execute() { + *this->is_initializing = true; + this->pipe = std::make_shared(this->model_path, this->device, this->properties); +}; + +void VLMInitWorker::OnOK() { + *this->is_initializing = false; + Callback().Call({Env().Null()}); +}; + +void VLMInitWorker::OnError(const Error& e) { + *this->is_initializing = false; + Callback().Call({Napi::Error::New(Env(), e.Message()).Value()}); +}; diff --git a/src/js/src/vlm_pipeline/perf_metrics.cpp b/src/js/src/vlm_pipeline/perf_metrics.cpp new file mode 100644 index 0000000000..6e2a258df8 --- /dev/null +++ b/src/js/src/vlm_pipeline/perf_metrics.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include 
"include/vlm_pipeline/perf_metrics.hpp" + +#include "include/addon.hpp" +#include "include/helper.hpp" + +using ov::genai::common_bindings::utils::get_ms; + +VLMPerfMetricsWrapper::VLMPerfMetricsWrapper(const Napi::CallbackInfo& info) + : BasePerfMetricsWrapper(info) {} + +Napi::Function VLMPerfMetricsWrapper::get_class(Napi::Env env) { + auto properties = BasePerfMetricsWrapper::get_class_properties(); + properties.push_back( + InstanceMethod("getPrepareEmbeddingsDuration", &VLMPerfMetricsWrapper::get_prepare_embeddings_duration)); + properties.push_back(InstanceAccessor<&VLMPerfMetricsWrapper::get_vlm_raw_metrics>("vlmRawMetrics")); + return DefineClass(env, "VLMPerfMetrics", properties); +} + +Napi::Object VLMPerfMetricsWrapper::wrap(Napi::Env env, const ov::genai::VLMPerfMetrics& metrics) { + const auto& prototype = env.GetInstanceData()->vlm_perf_metrics; + OPENVINO_ASSERT(prototype, "Invalid pointer to prototype."); + auto obj = prototype.New({}); + const auto m_ptr = Napi::ObjectWrap::Unwrap(obj); + m_ptr->_metrics = metrics; + return obj; +} + +Napi::Value VLMPerfMetricsWrapper::get_prepare_embeddings_duration(const Napi::CallbackInfo& info) { + VALIDATE_ARGS_COUNT(info, 0, "getPrepareEmbeddingsDuration()"); + return perf_utils::create_mean_std_pair(info.Env(), _metrics.get_prepare_embeddings_duration()); +} + +Napi::Value VLMPerfMetricsWrapper::get_raw_metrics(const Napi::CallbackInfo& info) { + return BasePerfMetricsWrapper::get_raw_metrics(info); +} + +Napi::Value VLMPerfMetricsWrapper::get_vlm_raw_metrics(const Napi::CallbackInfo& info) { + Napi::Object obj = Napi::Object::New(info.Env()); + obj.Set("prepareEmbeddingsDurations", + cpp_to_js, Napi::Value>( + info.Env(), + get_ms(_metrics.vlm_raw_metrics, &ov::genai::VLMRawPerfMetrics::prepare_embeddings_durations))); + + return obj; +} diff --git a/src/js/src/vlm_pipeline/start_chat_worker.cpp b/src/js/src/vlm_pipeline/start_chat_worker.cpp new file mode 100644 index 0000000000..bbce8cf210 --- /dev/null +++ b/src/js/src/vlm_pipeline/start_chat_worker.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "include/vlm_pipeline/start_chat_worker.hpp" + +VLMStartChatWorker::VLMStartChatWorker(Function& callback, + std::shared_ptr& pipe, + std::string system_message) + : AsyncWorker(callback), + pipe(pipe), + system_message(system_message) {}; + +void VLMStartChatWorker::Execute() { + this->pipe->start_chat(this->system_message); +}; + +void VLMStartChatWorker::OnOK() { + Callback().Call({Env().Null()}); +}; diff --git a/src/js/src/vlm_pipeline/vlm_pipeline_wrapper.cpp b/src/js/src/vlm_pipeline/vlm_pipeline_wrapper.cpp new file mode 100644 index 0000000000..cc1c14fbbc --- /dev/null +++ b/src/js/src/vlm_pipeline/vlm_pipeline_wrapper.cpp @@ -0,0 +1,287 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "include/vlm_pipeline/vlm_pipeline_wrapper.hpp" + +#include + +#include "include/addon.hpp" +#include "include/helper.hpp" +#include "include/tokenizer.hpp" +#include "include/vlm_pipeline/finish_chat_worker.hpp" +#include "include/vlm_pipeline/init_worker.hpp" +#include "include/vlm_pipeline/perf_metrics.hpp" +#include "include/vlm_pipeline/start_chat_worker.hpp" + +struct VLMTsfnContext { + VLMTsfnContext(std::string prompt, std::shared_ptr is_generating) + : prompt(prompt), + is_generating(is_generating) {}; + ~VLMTsfnContext() {}; + + std::thread native_thread; + Napi::ThreadSafeFunction callback; + std::optional 
streamer; + + std::string prompt; + std::vector images; + std::vector videos; + std::shared_ptr is_generating; + std::shared_ptr pipe = nullptr; + std::shared_ptr generation_config = nullptr; +}; + +void vlmPerformInferenceThread(VLMTsfnContext* context) { + auto report_error = [context](const std::string& message) { + auto status = context->callback.BlockingCall([message](Napi::Env env, Napi::Function jsCallback) { + try { + jsCallback.Call( + {Napi::Error::New(env, "vlmPerformInferenceThread error. " + message).Value(), env.Null()}); + } catch (std::exception& err) { + std::cerr << "The callback failed when attempting to return an error from vlmPerformInferenceThread. " + "Details:\n" + << err.what() << std::endl; + std::cerr << "Original error message:\n" << message << std::endl; + } + }); + if (status != napi_ok) { + std::cerr << "The BlockingCall failed with status " << status + << " when trying to return an error from vlmPerformInferenceThread." << std::endl; + std::cerr << "Original error message:\n" << message << std::endl; + } + }; + auto finalize = [context]() { + *context->is_generating = false; + context->callback.Release(); + if (context->streamer.has_value()) { + context->streamer->Release(); + } + }; + try { + ov::genai::GenerationConfig config; + config.update_generation_config(*context->generation_config); + + ov::genai::StreamerVariant streamer = std::monostate(); + std::vector streamer_exceptions; + if (context->streamer.has_value()) { + streamer = [context, &streamer_exceptions](std::string word) { + std::promise resultPromise; + napi_status status = context->streamer->BlockingCall( + [word, &resultPromise, &streamer_exceptions](Napi::Env env, Napi::Function jsCallback) { + try { + auto callback_result = jsCallback.Call({Napi::String::New(env, word)}); + if (callback_result.IsNumber()) { + resultPromise.set_value(static_cast( + callback_result.As().Int32Value())); + } else { + resultPromise.set_value(ov::genai::StreamingStatus::RUNNING); + } + } catch (std::exception& err) { + streamer_exceptions.push_back(err.what()); + resultPromise.set_value(ov::genai::StreamingStatus::CANCEL); + } + }); + + if (status != napi_ok) { + streamer_exceptions.push_back("The streamer callback BlockingCall failed with the status: " + + status); + return ov::genai::StreamingStatus::CANCEL; + } + + return resultPromise.get_future().get(); + }; + } + + ov::genai::VLMDecodedResults result; + + result = context->pipe->generate(context->prompt, context->images, context->videos, config, streamer); + + if (!streamer_exceptions.empty()) { + // If there were exceptions from the streamer, report them all as a single error and finish without result + std::string combined_error = "Streamer exceptions occurred:\n"; + for (size_t i = 0; i < streamer_exceptions.size(); ++i) { + combined_error += "[" + std::to_string(i + 1) + "] " + streamer_exceptions[i] + "\n"; + } + report_error(combined_error); + } else { + // If no exceptions from streamer, call the final callback with the result + napi_status status = + context->callback.BlockingCall([result, &report_error](Napi::Env env, Napi::Function jsCallback) { + try { + jsCallback.Call({ + env.Null(), // Error should be null in normal case + to_vlm_decoded_result(env, result) // Return DecodedResults as the final result + }); + } catch (std::exception& err) { + report_error("The final callback failed. 
Details:\n" + std::string(err.what())); + } + }); + + if (status != napi_ok) { + report_error("The final BlockingCall failed with status " + status); + } + } + finalize(); + } catch (std::exception& e) { + report_error(e.what()); + finalize(); + } +} + +VLMPipelineWrapper::VLMPipelineWrapper(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) {}; + +Napi::Function VLMPipelineWrapper::get_class(Napi::Env env) { + return DefineClass(env, + "VLMPipeline", + {InstanceMethod("init", &VLMPipelineWrapper::init), + InstanceMethod("generate", &VLMPipelineWrapper::generate), + InstanceMethod("getTokenizer", &VLMPipelineWrapper::get_tokenizer), + InstanceMethod("startChat", &VLMPipelineWrapper::start_chat), + InstanceMethod("finishChat", &VLMPipelineWrapper::finish_chat), + InstanceMethod("setChatTemplate", &VLMPipelineWrapper::set_chat_template), + InstanceMethod("setGenerationConfig", &VLMPipelineWrapper::set_generation_config)}); +} + +Napi::Value VLMPipelineWrapper::init(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(!this->pipe, "Pipeline is already initialized"); + OPENVINO_ASSERT(!*this->is_initializing, "Pipeline is already initializing"); + VALIDATE_ARGS_COUNT(info, 4, "init()"); + const std::string model_path = js_to_cpp(env, info[0]); + const std::string device = js_to_cpp(env, info[1]); + const auto& properties = js_to_cpp(env, info[2]); + OPENVINO_ASSERT(info[3].IsFunction(), "init callback is not a function"); + Napi::Function callback = info[3].As(); + + VLMInitWorker* asyncWorker = + new VLMInitWorker(callback, this->pipe, this->is_initializing, model_path, device, properties); + asyncWorker->Queue(); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return env.Undefined(); +} + +Napi::Value VLMPipelineWrapper::generate(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(this->pipe, "VLMPipeline is not initialized"); + OPENVINO_ASSERT(!*this->is_generating, "Another generation is already in progress"); + *this->is_generating = true; + VALIDATE_ARGS_COUNT(info, 6, "generate()"); + VLMTsfnContext* context = nullptr; + + // Arguments: prompt, images, videos, streamer, generationConfig, callback + auto prompt = js_to_cpp(env, info[0]); + auto images = js_to_cpp>(env, info[1]); + auto videos = js_to_cpp>(env, info[2]); + OPENVINO_ASSERT(info[3].IsFunction() || info[3].IsUndefined(), "generate callback is not a function"); + auto streamer = info[3].As(); + auto generation_config = js_to_cpp(env, info[4]); + OPENVINO_ASSERT(info[5].IsFunction(), "generate callback is not a function"); + auto callback = info[5].As(); + + context = new VLMTsfnContext(prompt, this->is_generating); + context->images = std::move(images); + context->videos = std::move(videos); + context->pipe = this->pipe; + context->generation_config = std::make_shared(generation_config); + + context->callback = + Napi::ThreadSafeFunction::New(env, + callback, // JavaScript function called asynchronously + "VLM_generate_callback", // Name + 0, // Unlimited queue + 1, // Only one thread will use this initially + [context, this](Napi::Env) { // Finalizer used to clean threads up + context->native_thread.join(); + delete context; + }); + if (!streamer.IsUndefined()) { + context->streamer = Napi::ThreadSafeFunction::New(env, + streamer, // JavaScript function called asynchronously + "VLM_generate_streamer", // Name + 0, // Unlimited queue + 1); // Only one thread will use this initially + } + 
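        // Note: the finalizer passed to the callback ThreadSafeFunction above joins
        // `native_thread` and deletes the context once the worker thread releases the TSFN,
        // so no additional cleanup is required after spawning the thread below.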
context->native_thread = std::thread(vlmPerformInferenceThread, context); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + *this->is_generating = false; + } + return env.Undefined(); +} + +Napi::Value VLMPipelineWrapper::start_chat(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(this->pipe, "VLMPipeline is not initialized"); + VALIDATE_ARGS_COUNT(info, 2, "startChat()"); + auto system_message = js_to_cpp(env, info[0]); + OPENVINO_ASSERT(info[1].IsFunction(), "startChat callback is not a function"); + auto callback = info[1].As(); + + VLMStartChatWorker* asyncWorker = new VLMStartChatWorker(callback, this->pipe, system_message); + asyncWorker->Queue(); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return env.Undefined(); +} + +Napi::Value VLMPipelineWrapper::finish_chat(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(this->pipe, "VLMPipeline is not initialized"); + VALIDATE_ARGS_COUNT(info, 1, "finishChat()"); + OPENVINO_ASSERT(info[0].IsFunction(), "finishChat callback is not a function"); + Napi::Function callback = info[0].As(); + + VLMFinishChatWorker* asyncWorker = new VLMFinishChatWorker(callback, this->pipe); + asyncWorker->Queue(); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return env.Undefined(); +} + +Napi::Value VLMPipelineWrapper::get_tokenizer(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(this->pipe, "VLMPipeline is not initialized"); + auto tokenizer = this->pipe->get_tokenizer(); + return TokenizerWrapper::wrap(env, tokenizer); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return env.Undefined(); +} + +Napi::Value VLMPipelineWrapper::set_chat_template(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(this->pipe, "VLMPipeline is not initialized"); + VALIDATE_ARGS_COUNT(info, 1, "setChatTemplate()"); + auto chat_template = js_to_cpp(env, info[0]); + this->pipe->set_chat_template(chat_template); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return env.Undefined(); +} + +Napi::Value VLMPipelineWrapper::set_generation_config(const Napi::CallbackInfo& info) { + auto env = info.Env(); + try { + OPENVINO_ASSERT(this->pipe, "VLMPipeline is not initialized"); + VALIDATE_ARGS_COUNT(info, 1, "setGenerationConfig()"); + auto config_map = js_to_cpp(env, info[0]); + ov::genai::GenerationConfig config; + config.update_generation_config(config_map); + this->pipe->set_generation_config(config); + } catch (const std::exception& ex) { + Napi::Error::New(env, ex.what()).ThrowAsJavaScriptException(); + } + return env.Undefined(); +} diff --git a/src/js/tests/models.js b/src/js/tests/models.js index f3daf59986..5fef7cda40 100644 --- a/src/js/tests/models.js +++ b/src/js/tests/models.js @@ -2,4 +2,5 @@ export const models = { LLM: "OpenVINO/Llama-3.1-8B-Instruct-FastDraft-150M-int8-ov", InstructLLM: "OpenVINO/Qwen2.5-1.5B-Instruct-int4-ov", Embedding: "OpenVINO/bge-base-en-v1.5-fp16-ov", + VLM: "OpenVINO/Qwen2-VL-7B-Instruct-int4-ov", }; diff --git a/src/js/tests/utils.js b/src/js/tests/utils.js index be886b4bcb..0416941d99 100644 --- a/src/js/tests/utils.js +++ b/src/js/tests/utils.js @@ -1,6 +1,7 @@ import { bootstrap } from "global-agent"; 
import { promises as fs } from "node:fs"; import { listFiles, downloadFile } from "@huggingface/hub"; +import { addon as ov } from "openvino-node"; const BASE_DIR = "./tests/models/"; @@ -45,3 +46,60 @@ async function saveFile(file, response) { await fs.writeFile(file, Buffer.from(arrayBuffer)); } + +/** + * Creates a synthetic test image tensor with a gradient pattern. + * + * Generates a small RGB image filled with a gradient pattern for testing VLM pipelines. + * The red channel varies by height, green by width, and blue is constant. + * + * @param height - Height of the image in pixels. (default: 32) + * @param width - Width of the image in pixels. (default: 32) + * @returns An OpenVINO Tensor with shape [height, width, channels] and uint8 data type. + */ +export function createTestImageTensor(height = 32, width = 32) { + const channels = 3; + const data = new Uint8Array(height * width * channels); + + // Fill with gradient pattern + for (let h = 0; h < height; h++) { + for (let w = 0; w < width; w++) { + const idx = (h * width + w) * channels; + data[idx] = h * 8; // R + data[idx + 1] = w * 8; // G + data[idx + 2] = 128; // B + } + } + + return new ov.Tensor("u8", [height, width, channels], data); +} + +/** + * Creates a synthetic test video tensor with multiple frames. + * + * Generates a video tensor with a synthetic pattern that varies across frames. + * Each frame has a slightly different color pattern to simulate temporal variation. + * Useful for testing VLM pipelines with video inputs. + * + * @param frames - Number of video frames to generate. (default: 4) + * @param height - Height of each frame in pixels. (default: 32) + * @param width - Width of each frame in pixels. (default: 32) + * @returns An OpenVINO Tensor with shape [frames, height, width, channels] and uint8 data type. 
+ */ +export function createTestVideoTensor(frames = 4, height = 32, width = 32) { + const channels = 3; + const data = new Uint8Array(frames * height * width * channels); + + for (let f = 0; f < frames; f++) { + for (let h = 0; h < height; h++) { + for (let w = 0; w < width; w++) { + const idx = (f * height * width + h * width + w) * channels; + data[idx] = (h + f * 10) % 256; + data[idx + 1] = (w + f * 10) % 256; + data[idx + 2] = 128; + } + } + } + + return new ov.Tensor("u8", [frames, height, width, channels], data); +} diff --git a/src/js/tests/vlmPipeline.test.js b/src/js/tests/vlmPipeline.test.js new file mode 100644 index 0000000000..8611b772e8 --- /dev/null +++ b/src/js/tests/vlmPipeline.test.js @@ -0,0 +1,153 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import { Tokenizer, VLMPipeline, DecodedResults, VLMDecodedResults } from "../dist/index.js"; + +import assert from "node:assert"; +import { describe, it, before } from "node:test"; +import { models } from "./models.js"; +import { createTestImageTensor, createTestVideoTensor } from "./utils.js"; + +const MODEL_PATH = process.env.VLM_MODEL_PATH || `./tests/models/${models.VLM.split("/")[1]}`; + +// Skip tests on macOS due to insufficient memory +describe("VLMPipeline", { skip: process.platform === "darwin" }, () => { + let pipeline, testImage1, testImage2, testVideo1, testVideo2; + + before(async () => { + pipeline = await VLMPipeline(MODEL_PATH, "CPU"); + pipeline.setGenerationConfig({ max_new_tokens: 10 }); + testImage1 = createTestImageTensor(); + testImage2 = createTestImageTensor(50, 50); + testVideo1 = createTestVideoTensor(); + testVideo2 = createTestVideoTensor(6, 64, 64); + }); + + it("should generate text without images", async () => { + const result = await pipeline.generate("What is 2+2?"); + + assert.ok(result instanceof DecodedResults, "Result should be instance of DecodedResults"); + assert.ok( + result instanceof VLMDecodedResults, + "Result should be instance of VLMDecodedResults", + ); + assert.ok(result.texts.length > 0, "Should generate some output"); + }); + + it("should generate text with images", async () => { + const result = await pipeline.generate("Compare these two images.", { + images: [testImage1, testImage2], + }); + + assert.strictEqual(result.texts.length, 1, "Should generate comparison"); + }); + + it("should generate text with video input", async () => { + const result = await pipeline.generate("Describe what happens in this video.", { + videos: [testVideo1], + generationConfig: { + max_new_tokens: 20, + temperature: 0, + }, + }); + + assert.strictEqual(result.texts.length, 1); + }); + + it("should generate with both image and video", async () => { + const result = await pipeline.generate("Compare the image and video.", { + images: [testImage1], + videos: [testVideo2], + generationConfig: { max_new_tokens: 20, temperature: 0 }, + }); + + assert.strictEqual(result.texts.length, 1); + }); + + it("throw error on invalid streamer", async () => { + await assert.rejects( + pipeline.generate("What is 2+2?", { + streamer: () => { + throw new Error("Test error"); + }, + }), + /Test error/, + ); + }); + + it("throw error with invalid generationConfig", async () => { + await assert.rejects( + pipeline.generate("What is 2+2?", { + generationConfig: { max_new_tokens: "five" }, + }), + /vlmPerformInferenceThread error/, + ); + }); + + it("should support streaming generation", async () => { + const chunks = []; + + const stream = pipeline.stream("What do you see?", { 
+ images: [testImage1], + generationConfig: { + max_new_tokens: 15, + temperature: 0, + }, + }); + + for await (const chunk of stream) { + chunks.push(chunk); + } + + assert.ok(chunks.length > 0, "Should receive streaming chunks"); + const fullOutput = chunks.join(""); + assert.ok(fullOutput.length > 0, "Combined chunks should form output"); + }); + + it("should return VLMDecodedResults with perfMetrics", async () => { + const result = await pipeline.generate("Describe the image.", { + images: [testImage2], + generationConfig: { + max_new_tokens: 10, + temperature: 0, + }, + }); + + assert.ok(result, "Should return result"); + assert.ok(result.perfMetrics, "Should have perfMetrics"); + // Property from base PerformanceMetrics + const numTokens = result.perfMetrics.getNumGeneratedTokens(); + assert.ok(typeof numTokens === "number", "getNumGeneratedTokens should return number"); + assert.ok( + 0 < numTokens && numTokens <= 10, + "Number of tokens should be between 0 and max_new_tokens", + ); + // VLM-specific properties + const prepareEmbeddings = result.perfMetrics.getPrepareEmbeddingsDuration(); + assert.ok( + typeof prepareEmbeddings.mean === "number", + "PrepareEmbeddingsDuration should have mean", + ); + const { prepareEmbeddingsDurations } = result.perfMetrics.vlmRawMetrics; + assert.ok( + Array.isArray(prepareEmbeddingsDurations), + "Should have duration of preparation of embeddings", + ); + assert.ok(prepareEmbeddingsDurations.length > 0, "Should have at least one duration value"); + }); + + it("should get tokenizer from pipeline", () => { + const tokenizer = pipeline.getTokenizer(); + assert.ok(tokenizer instanceof Tokenizer, "Should return tokenizer"); + }); + + it("should start and finish chat", async () => { + await pipeline.startChat("You are an assistant named Tom."); + const result1 = await pipeline.generate("What is your name?"); + assert.ok(/Tom/.test(result1.toString())); + + await pipeline.finishChat(); + const result2 = await pipeline.generate("What is your name?"); + assert.ok(!/Tom/.test(result2.toString())); + }); +});
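For reviewers, a minimal end-to-end usage sketch of the JavaScript API this patch adds, based only on the calls exercised by the tests above (pipeline construction, generate with image tensors, streaming, chat mode, and VLM performance metrics). The model path and the zero-filled image tensor are illustrative placeholders, not part of the change:

import { addon as ov } from "openvino-node";
import { VLMPipeline } from "openvino-genai-node";

// Placeholder image: a black 32x32 RGB tensor in [H, W, C] layout with u8 data.
// A real application would decode an image file instead (see the sharp-based docs example).
const height = 32, width = 32, channels = 3;
const image = new ov.Tensor("u8", [height, width, channels], new Uint8Array(height * width * channels));

// Illustrative model path; point it at a converted VLM such as the one used in the tests.
const pipe = await VLMPipeline("./models/Qwen2-VL-7B-Instruct-int4-ov", "CPU");

// One-shot generation with an image input.
const result = await pipe.generate("Describe the image.", {
  images: [image],
  generationConfig: { max_new_tokens: 50 },
});
console.log(result.texts[0]);
console.log(result.perfMetrics.getPrepareEmbeddingsDuration()); // { mean, std }

// Chat mode with token streaming.
await pipe.startChat("You are a concise assistant.");
for await (const chunk of pipe.stream("What do you see?", { images: [image] })) {
  process.stdout.write(chunk);
}
await pipe.finishChat();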