diff --git a/src/cpp/src/visual_language/llava_next_video/classes.cpp b/src/cpp/src/visual_language/llava_next_video/classes.cpp
index 907f78ecbd..a901252ac7 100644
--- a/src/cpp/src/visual_language/llava_next_video/classes.cpp
+++ b/src/cpp/src/visual_language/llava_next_video/classes.cpp
@@ -4,14 +4,204 @@
 #include "visual_language/llava_next_video/classes.hpp"
 #include "visual_language/clip.hpp"
 #include "visual_language/processor_config.hpp"
+#include "openvino/opsets/opset13.hpp"
 
 namespace ov::genai {
 
+namespace {
+
+std::shared_ptr<ov::Node> create_bicubic_resize(std::shared_ptr<ov::Node> input, std::shared_ptr<ov::Node> target_size) {
+    using namespace ov::op;
+
+    // Convert to float32 before interpolation (required for bicubic)
+    auto input_f32 = std::make_shared<v0::Convert>(input, ov::element::f32);
+
+    // For NHWC format, resize axes are [1, 2] (height, width dimensions)
+    auto axes = v0::Constant::create(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{1, 2});
+
+    v11::Interpolate::InterpolateAttrs attrs;
+    attrs.mode = v11::Interpolate::InterpolateMode::CUBIC;
+    attrs.shape_calculation_mode = v11::Interpolate::ShapeCalcMode::SIZES;
+    attrs.coordinate_transformation_mode = v11::Interpolate::CoordinateTransformMode::ASYMMETRIC;
+    attrs.cube_coeff = -0.5f;  // Catmull-Rom bicubic coefficient (a = -0.5), chosen to match CPU preprocessing
+    attrs.nearest_mode = v11::Interpolate::NearestMode::FLOOR;
+    attrs.pads_begin = {0, 0};
+    attrs.pads_end = {0, 0};
+    attrs.antialias = false;
+
+    return std::make_shared<v11::Interpolate>(input_f32, target_size, axes, attrs);
+}
+
+std::shared_ptr<ov::Node> create_mean_scale(std::shared_ptr<ov::Node> input_u8_or_f32, const ProcessorConfig& config) {
+    using namespace ov::op;
+
+    std::shared_ptr<ov::Node> input_f32;
+
+    // Convert to float32 if input is uint8, otherwise use as-is
+    if (input_u8_or_f32->get_element_type() == ov::element::u8) {
+        input_f32 = std::make_shared<v0::Convert>(input_u8_or_f32, ov::element::f32);
+    } else {
+        input_f32 = input_u8_or_f32;
+    }
+
+    // Follow the original mean_scale() function logic exactly, in tensor form:
+    // Per-element, per-channel normalization:
+    //   (float(x) / 255.0f - config.image_mean[c]) / config.image_std[c], implemented via OV ops with broadcasting.
+    // Step 1: x / 255.0
+    auto scale_255 = v0::Constant::create(ov::element::f32, ov::Shape{}, std::vector<float>{255.0f});
+    auto divided_by_255 = std::make_shared<v1::Divide>(input_f32, scale_255);
+
+    // Step 2: Create mean and std constants [R, G, B] - broadcasted along channel dimension
+    // For NHWC format, we need shape [1, 1, 1, 3] to broadcast correctly
+    auto mean_const = v0::Constant::create(ov::element::f32, ov::Shape{1, 1, 1, 3},
+        std::vector<float>{config.image_mean[0], config.image_mean[1], config.image_mean[2]});
+    auto std_const = v0::Constant::create(ov::element::f32, ov::Shape{1, 1, 1, 3},
+        std::vector<float>{config.image_std[0], config.image_std[1], config.image_std[2]});
+
+    // Step 3: (x/255.0 - mean)
+    auto mean_subtracted = std::make_shared<v1::Subtract>(divided_by_255, mean_const);
+
+    // Step 4: (x/255.0 - mean) / std
+    auto result = std::make_shared<v1::Divide>(mean_subtracted, std_const);
+
+    return result;
+}
+
+std::shared_ptr<ov::Node> create_channels_first(std::shared_ptr<ov::Node> input_nhwc) {
+    using namespace ov::op;
+
+    // Transpose from NHWC (0,1,2,3) to NCHW (0,3,1,2)
+    auto transpose_order = v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
+    return std::make_shared<v1::Transpose>(input_nhwc, transpose_order);
+}
+
+std::shared_ptr<ov::Node> create_center_crop(std::shared_ptr<ov::Node> input, std::shared_ptr<ov::Node> crop_size) {
+    using namespace ov::op;
+
+    // Extract crop height and width from crop_size
+    auto gather_axis = v0::Constant::create(ov::element::i64, ov::Shape{}, std::vector<int64_t>{0});
+    auto idx_0 = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+    auto idx_1 = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
+    auto idx_2 = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{2});
+    auto crop_height = std::make_shared<v8::Gather>(crop_size, idx_0, gather_axis);
+    auto crop_width = std::make_shared<v8::Gather>(crop_size, idx_1, gather_axis);
+
+    // Get input shape
+    auto shape_node = std::make_shared<v3::ShapeOf>(input);
+    auto H = std::make_shared<v8::Gather>(shape_node, idx_1, gather_axis);
+    auto W = std::make_shared<v8::Gather>(shape_node, idx_2, gather_axis);
+
+    // Calculate start positions: start_y = (H - crop_height) / 2, start_x = (W - crop_width) / 2
+    auto const_2 = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{2});
+    auto start_y = std::make_shared<v1::Divide>(std::make_shared<v1::Subtract>(H, crop_height), const_2);
+    auto start_x = std::make_shared<v1::Divide>(std::make_shared<v1::Subtract>(W, crop_width), const_2);
+
+    // Calculate end positions: end_y = start_y + crop_height, end_x = start_x + crop_width
+    auto end_y = std::make_shared<v1::Add>(start_y, crop_height);
+    auto end_x = std::make_shared<v1::Add>(start_x, crop_width);
+
+    // Create slice start and stop vectors
+    auto zero = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+    auto max_val = v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{std::numeric_limits<int64_t>::max()});
+
+    // start = [0, start_y, start_x, 0]
+    auto start = std::make_shared<v0::Concat>(ov::NodeVector{zero, start_y, start_x, zero}, 0);
+    // stop = [max, end_y, end_x, max]
+    auto stop = std::make_shared<v0::Concat>(ov::NodeVector{max_val, end_y, end_x, max_val}, 0);
+    // step = [1, 1, 1, 1]
+    auto step = v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1, 1, 1, 1});
+
+    // Apply slice
+    auto sliced = std::make_shared<v8::Slice>(input, start, stop, step);
+
+    return sliced;
+}
+
+// Helper function to calculate resize dimensions based on shortest edge
+ImageSize calculate_resize_dimensions(
+    const ImageSize& original_size,
+    size_t target_shortest_edge) {
+    float scale = static_cast<float>(target_shortest_edge) / std::min(original_size.height, original_size.width);
+    size_t new_height = static_cast<size_t>(original_size.height * scale);
+    size_t new_width = static_cast<size_t>(original_size.width * scale);
+    return {new_height, new_width};
+}
+
+// Helper function to set preprocessing parameters for integrated OV preprocessing model
+void set_preprocess_parameters(
+    ov::InferRequest& encoder,
+    const ov::Tensor& input_frames,
+    const ImageSize& original_size,
+    const ProcessorConfig& config) {
+
+    // Calculate resize target size
+    auto resized_size = calculate_resize_dimensions(original_size, config.size_shortest_edge);
+
+    // Set resize target size
+    ov::Tensor target_size_tensor(ov::element::i64, {2});
+    target_size_tensor.data<int64_t>()[0] = resized_size.height;
+    target_size_tensor.data<int64_t>()[1] = resized_size.width;
+
+    // Set crop size
+    ov::Tensor crop_size_tensor(ov::element::i64, {2});
+    crop_size_tensor.data<int64_t>()[0] = config.crop_size_height;
+    crop_size_tensor.data<int64_t>()[1] = config.crop_size_width;
+
+    encoder.set_input_tensor(0, input_frames);
+    encoder.set_input_tensor(1, target_size_tensor);
+    encoder.set_input_tensor(2, crop_size_tensor);
+}
+
+bool can_use_ov_vision_preprocess() {
+    const char* env = std::getenv("VISION_PREPROCESS");
+    return !(env && std::string(env) == "CPP");
+}
+
+std::shared_ptr<ov::Model> patch_preprocess_into_vision_encoder_model(
+    const std::shared_ptr<ov::Model>& vision_encoder_model,
+    const ProcessorConfig& config) {
+    using namespace ov;
+    using namespace ov::op;
+
+    // Input: concatenated image/video frames in NHWC format (uint8)
+    // Shape: {num_frames, -1, -1, 3} => {batch=num_frames, height=dynamic, width=dynamic, channels=3 (RGB)}
+    auto input_frames = std::make_shared<v0::Parameter>(element::u8, PartialShape{-1, -1, -1, 3});
+    input_frames->set_friendly_name("input_frames");
+
+    // Target size for bicubic resize [height, width]
+    auto resize_target_size = std::make_shared<v0::Parameter>(element::i64, PartialShape{2});
+    resize_target_size->set_friendly_name("resize_target_size");
+
+    // Crop size [height, width]
+    auto crop_size = std::make_shared<v0::Parameter>(element::i64, PartialShape{2});
+    crop_size->set_friendly_name("crop_size");
+
+    // Apply preprocessing operations
+    auto resized = create_bicubic_resize(input_frames, resize_target_size);
+    auto cropped = create_center_crop(resized, crop_size);
+    auto normalized = create_mean_scale(cropped, config);
+    auto preprocessed = create_channels_first(normalized);
+
+    // Connect preprocessing output to vision encoder input
+    auto vision_params = vision_encoder_model->get_parameters();
+    auto vision_results = vision_encoder_model->get_results();
+
+    // Replace pixel_values parameter with preprocessing output
+    vision_params[0]->output(0).replace(preprocessed);
+
+    return std::make_shared<Model>(
+        vision_results,
+        ParameterVector{input_frames, resize_target_size, crop_size}
+    );
+}
+
+}  // namespace
+
 std::pair<size_t, size_t> get_unpadded_features(size_t height, size_t width, size_t patches_height, size_t patches_width, size_t scale_height, size_t scale_width) {
     size_t current_height = patches_height * scale_height;
     size_t current_width = patches_width * scale_width;
-    
+
     float original_aspect_ratio = (float)width / height;
     float current_aspect_ratio = (float)current_width / current_height;
     if (original_aspect_ratio > current_aspect_ratio) {
@@ -33,11 +223,8 @@ std::pair<size_t, size_t> get_unpadded_features(size_t height, size_t width, siz
 clip_image_f32 preprocess_clip_image_llava_next_video(const clip_image_u8& image, ProcessorConfig& config) {
     // Resize
     clip_image_u8 resized_image;
-    int target_size = config.size_shortest_edge;
-    float scale = static_cast<float>(target_size) / std::min(image.nx, image.ny);
-    int new_width = static_cast<int>(image.nx * scale);
-    int new_height = static_cast<int>(image.ny * scale);
-    bicubic_resize(image, resized_image, new_width, new_height);
+    auto resized_size = calculate_resize_dimensions({static_cast<size_t>(image.ny), static_cast<size_t>(image.nx)}, config.size_shortest_edge);
+    bicubic_resize(image, resized_image, static_cast<int>(resized_size.width), static_cast<int>(resized_size.height));
 
     // Center crop
     clip_image_u8 cropped_image = center_crop(resized_image, config.crop_size_height, config.crop_size_width);
@@ -58,7 +245,21 @@ clip_image_f32 preprocess_clip_image_llava_next_video(const clip_image_u8& image
 VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
     const std::filesystem::path& model_dir,
     const std::string& device,
-    const ov::AnyMap properties) : VisionEncoderLLaVANext(model_dir, device, properties) {
+    const ov::AnyMap properties) : VisionEncoderLLaVANext(model_dir, device, properties),
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
+        // Create integrated preprocessing + vision encoder model for image/video processing
+        auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
+        auto model = patch_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
+        auto compiled_model = utils::singleton_core().compile_model(model, device, properties);
+        // Overwrite vision encoder queue with integrated model
+        m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
+            compiled_model.get_property(ov::optimal_number_of_infer_requests),
+            [&compiled_model]() -> ov::InferRequest {
+                return compiled_model.create_infer_request();
+            });
+    }
+
     auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_multi_modal_projector_model.xml", device, {});
     m_ireq_queue_multi_modal_projector = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
         compiled_model.get_property(ov::optimal_number_of_infer_requests),
@@ -76,10 +277,25 @@ VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
 }
 
 VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
-        const ModelsMap& models_map,
-        const std::filesystem::path& config_dir_path,
-        const std::string& device,
-        const ov::AnyMap device_config) : VisionEncoderLLaVANext{models_map, config_dir_path, device, device_config} {
+    const ModelsMap& models_map,
+    const std::filesystem::path& config_dir_path,
+    const std::string& device,
+    const ov::AnyMap device_config) : VisionEncoderLLaVANext{models_map, config_dir_path, device, device_config},
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
+        // Create integrated preprocessing + vision encoder model for image/video processing
+        const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
+        auto vision_encoder_model_original = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
+        auto model = patch_preprocess_into_vision_encoder_model(vision_encoder_model_original, m_processor_config);
+        auto compiled_model = utils::singleton_core().compile_model(model, device, device_config);
+        // Overwrite vision encoder queue with integrated model
+        m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
+            compiled_model.get_property(ov::optimal_number_of_infer_requests),
+            [&compiled_model]() -> ov::InferRequest {
+                return compiled_model.create_infer_request();
+            });
+    }
+
     const auto& resampler_model = utils::get_model_weights_pair(models_map, "resampler").first;
     const auto& resampler_weights = utils::get_model_weights_pair(models_map, "resampler").second;
     const auto& mm_projector_model = utils::get_model_weights_pair(models_map, "multi_modal_projector").first;
@@ -111,12 +327,45 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
     ov::InferRequest& mm_projector = infer_request_guard_mm_projector.get();
     ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
 
-    // preprocess image
-    ov::Tensor pixel_values = get_pixel_values_llava_next(image, config);
-    auto pixel_values_shape = pixel_values.get_shape();
+    ov::Shape pixel_values_shape;
+    if (use_ov_vision_preprocess) {
+        // Use integrated OV preprocessing model with batch processing similar to get_pixel_values_llava_next
+        clip_image_u8 input_image = tensor_to_clip_image_u8(image);
+
+        std::pair<int, int> size{config.size_shortest_edge, config.size_shortest_edge};
+        auto patch_size = config.crop_size_height;
+        auto image_patches = get_image_patches(input_image, config.image_grid_pinpoints, size, patch_size);
 
-    // infer vision eztracting models
-    encoder.set_tensor("pixel_values", pixel_values);
+        size_t num_patches = image_patches.size();
+
+        // Get dimensions from first patch
+        size_t patch_height = image_patches[0].ny;
+        size_t patch_width = image_patches[0].nx;
+
+        // Concatenate all patches into a single batch tensor (similar to preprocess_frames_ov)
+        ov::Shape concat_shape = {num_patches, patch_height, patch_width, 3};
+        ov::Tensor concatenated_patches(ov::element::u8, concat_shape);
+
+        uint8_t* concat_data = concatenated_patches.data<uint8_t>();
+        for (size_t i = 0; i < num_patches; i++) {
+            // clip_image_u8 has layout HWC, copy directly
+            std::memcpy(concat_data, image_patches[i].buf.data(), image_patches[i].buf.size());
+            concat_data += image_patches[i].buf.size();
+        }
+
+        // Set inputs for integrated preprocessing model
+        set_preprocess_parameters(encoder, concatenated_patches, {patch_height, patch_width}, config);
+
+        // Set pixel_values_shape for later use
+        pixel_values_shape = {num_patches, 3, static_cast<size_t>(config.crop_size_height), static_cast<size_t>(config.crop_size_width)};
+    } else {
+        // Use CPU preprocessing
+        ov::Tensor pixel_values = get_pixel_values_llava_next(image, config);
+        pixel_values_shape = pixel_values.get_shape();
+        encoder.set_tensor("pixel_values", pixel_values);
+    }
+
+    // infer vision feature extraction models
     encoder.infer();
     mm_projector.set_tensor("image_features", encoder.get_tensor("last_hidden_state"));
     mm_projector.infer();
@@ -204,9 +453,8 @@ ov::Tensor InputsEmbedderLLaVANextVideo::get_inputs_embeds(
     const std::vector<size_t>& images_sequence,
     const std::vector<size_t>& videos_sequence,
     const std::vector<std::pair<size_t, size_t>>& history_vision_count) {
-    
+
     ov::Tensor image_newline;
-    size_t searched_pos = 0;
     std::vector<ov::Tensor> image_embeds;
     for (size_t new_image_id : images_sequence) {
         const EncodedImage& encoded_image = images.at(new_image_id);
@@ -255,56 +503,73 @@ ov::Tensor InputsEmbedderLLaVANextVideo::get_inputs_embeds(
     return text_embeds;
 }
 
-std::pair<std::vector<ov::Tensor>, size_t> VisionEncoderLLaVANextVideo::preprocess_frames(const std::vector<ov::Tensor>& frames) {
-    std::vector<ov::Tensor> res;
-
-    // preprocess frames
-    ProcessorConfig config = utils::from_any_map({}, m_processor_config);
+ov::Tensor VisionEncoderLLaVANextVideo::preprocess_frames_cpp(const std::vector<ov::Tensor>& frames) {
+    ProcessorConfig config = get_processor_config();
     size_t num_frames = frames.size();
-    for (size_t i=0; i < num_frames; i++) {
+    std::vector<ov::Tensor> preprocessed_frames;
+    preprocessed_frames.reserve(num_frames);
+
+    // Preprocess frames using CPU
+    for (size_t i = 0; i < num_frames; i++) {
         clip_image_u8 clip_image = tensor_to_clip_image_u8(frames[i]);
         auto preprocessed = preprocess_clip_image_llava_next_video(clip_image, config);
-        auto preprocessed_tensor = clip_image_f32_to_tensor(preprocessed);
-        res.push_back(preprocessed_tensor);
-
+        preprocessed_frames.push_back(clip_image_f32_to_tensor(preprocessed));
     }
 
-    ov::Shape resized_shape = res[0].get_shape();
-    size_t height = resized_shape[2];
-    size_t width = resized_shape[3];
-
-    size_t num_video_tokens = ((float)height / m_patch_size) * ((float)width / m_patch_size);
-    num_video_tokens = num_video_tokens / 4 * num_frames;
+    // Concatenate preprocessed frames to single tensor
+    const ov::Shape& first_shape = preprocessed_frames[0].get_shape();
+    ov::Shape concat_shape = first_shape;
+    concat_shape[0] = num_frames;
+    ov::Tensor concatenated_frames(preprocessed_frames[0].get_element_type(), concat_shape);
+
+    size_t frame_byte_size = preprocessed_frames[0].get_byte_size();
+    float* frames_data = concatenated_frames.data<float>();
+    for (size_t i = 0; i < num_frames; i++) {
+        std::memcpy(frames_data, preprocessed_frames[i].data(), frame_byte_size);
+        frames_data += ov::shape_size(first_shape);
+    }
 
-    return {res, num_video_tokens};
+    return concatenated_frames;
 }
 
 std::vector<EncodedVideo> InputsEmbedderLLaVANextVideo::encode_videos(const std::vector<ov::Tensor>& videos) {
+    auto vision_encoder = std::static_pointer_cast<VisionEncoderLLaVANextVideo>(m_vision_encoder);
+    auto config = vision_encoder->get_processor_config();
+
     std::vector<EncodedVideo> encoded_videos;
     for (const auto video: videos) {
         std::vector<ov::Tensor> frames = to_single_image_tensors({video});
-        auto vision_encoder = std::static_pointer_cast<VisionEncoderLLaVANextVideo>(m_vision_encoder);
-        auto [prepprocessed_frames, num_video_tokens] = vision_encoder->preprocess_frames(frames);
-
-        // concat preprocessed frames to single tensor
-        ov::Shape concat_shape = prepprocessed_frames[0].get_shape();
-        concat_shape[0] = prepprocessed_frames.size();
-        ov::Tensor concatinated_frames = ov::Tensor(prepprocessed_frames[0].get_element_type(), concat_shape);
-        float* frames_data = concatinated_frames.data<float>();
-        for (size_t i = 0; i < prepprocessed_frames.size(); i++) {
-            memcpy(frames_data, prepprocessed_frames[i].data(), prepprocessed_frames[i].get_byte_size());
-            frames_data+=ov::shape_size(prepprocessed_frames[i].get_shape());
-        }
+        size_t num_frames = frames.size();
+
+        // Calculate num_video_tokens (same for both OV and CPU preprocessing)
+        size_t num_video_tokens = ((config.crop_size_height / vision_encoder->get_patch_size()) *
+                                   (config.crop_size_width / vision_encoder->get_patch_size()) / 4) * num_frames;
 
         // infer video feature extraction models
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(vision_encoder->get_vision_encoder());
         ov::InferRequest& encoder = infer_request_guard.get();
-        CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard_mm_projector(vision_encoder->get_multi_modal_projector());
-        ov::InferRequest& mm_projector = infer_request_guard_mm_projector.get();
+
+        if (vision_encoder->get_use_ov_vision_preprocess()) {
+            // Use integrated OV preprocessing model - pass video tensor directly
+            auto frame_shape = frames[0].get_shape();
+            size_t orig_height = frame_shape[1];
+            size_t orig_width = frame_shape[2];
+
+            // Set inputs for integrated model
+            set_preprocess_parameters(encoder, video, {orig_height, orig_width}, config);
+        } else {
+            // Use CPU preprocessing - preprocess and concatenate frames
+            ov::Tensor concatenated_frames = vision_encoder->preprocess_frames_cpp(frames);
+            encoder.set_tensor("pixel_values", concatenated_frames);
+        }
+
+        encoder.infer();
+
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard_resampler(vision_encoder->get_vision_resampler());
         ov::InferRequest& resampler = infer_request_guard_resampler.get();
-        encoder.set_tensor("pixel_values", concatinated_frames);
-        encoder.infer();
+        CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard_mm_projector(vision_encoder->get_multi_modal_projector());
+        ov::InferRequest& mm_projector = infer_request_guard_mm_projector.get();
+
         resampler.set_input_tensor(encoder.get_tensor("last_hidden_state"));
         resampler.infer();
         mm_projector.set_tensor("image_features", resampler.get_output_tensor());
@@ -359,4 +624,4 @@ NormalizedPrompt InputsEmbedderLLaVANextVideo::normalize_prompt(const std::strin
     return {std::move(unified_prompt), std::move(images_sequence), std::move(video_sequence)};
 }
 
-} // namespace ov::genai
\ No newline at end of file
+} // namespace ov::genai
diff --git a/src/cpp/src/visual_language/llava_next_video/classes.hpp b/src/cpp/src/visual_language/llava_next_video/classes.hpp
index 90d41cd403..0751eb0835 100644
--- a/src/cpp/src/visual_language/llava_next_video/classes.hpp
+++ b/src/cpp/src/visual_language/llava_next_video/classes.hpp
@@ -15,7 +15,7 @@ class VisionEncoderLLaVANextVideo : public VisionEncoderLLaVANext {
 
     EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
 
-    std::pair<std::vector<ov::Tensor>, size_t> preprocess_frames(const std::vector<ov::Tensor>& frames);
+    ov::Tensor preprocess_frames_cpp(const std::vector<ov::Tensor>& frames);
 
     VisionEncoderLLaVANextVideo(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap properties);
 
@@ -36,10 +36,19 @@ class VisionEncoderLLaVANextVideo : public VisionEncoderLLaVANext {
         return m_ireq_queue_vision_resampler.get();
     }
 
+    bool get_use_ov_vision_preprocess() const {
+        return use_ov_vision_preprocess;
+    }
+
+    size_t get_patch_size() const {
+        return m_patch_size;
+    }
+
 private:
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_multi_modal_projector;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_resampler;
     size_t m_patch_size;
+    bool use_ov_vision_preprocess = true;  // use OV vision preprocessing by default; set env VISION_PREPROCESS=CPP to use CPU vision preprocessing
 };
 
 class InputsEmbedderLLaVANextVideo : public InputsEmbedderLLaVANext {
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp
index 698fb595cd..cf5b010a4b 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.cpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp
@@ -888,7 +888,7 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
 
     ImageSize image_size;
 
-    if (use_ov_image_preprocess) {
+    if (use_ov_vision_preprocess) {
         ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops);
         image_size = ImageSize{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
 
@@ -921,8 +921,8 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
     return encoded_image;
 }
 
-bool can_use_ov_image_preprocess() {
-    const char* env = std::getenv("IMAGE_PREPROCESS");
+bool can_use_ov_vision_preprocess() {
+    const char* env = std::getenv("VISION_PREPROCESS");
     return !(env && std::string(env) == "CPP");
 }
 
@@ -930,8 +930,8 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(const std::filesystem::path& model_dir,
     const std::string& device,
     const ov::AnyMap properties) :
     VisionEncoder(model_dir, device, properties),
-    use_ov_image_preprocess(can_use_ov_image_preprocess()) {
-    if (use_ov_image_preprocess) {
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
         auto model = patch_image_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
         auto compiled_model = utils::singleton_core().compile_model(model, device, properties);
@@ -963,8 +963,8 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(const ModelsMap& models_map,
     const std::string& device,
     const ov::AnyMap properties) :
     VisionEncoder(models_map, config_dir_path, device, properties),
-    use_ov_image_preprocess(can_use_ov_image_preprocess()) {
-    if (use_ov_image_preprocess) {
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
         auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
         auto model = patch_image_preprocess_into_vision_encoder_model(model_org, m_processor_config);
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.hpp b/src/cpp/src/visual_language/phi3_vision/classes.hpp
index 5642d72ccc..6a5b38866e 100644
--- a/src/cpp/src/visual_language/phi3_vision/classes.hpp
+++ b/src/cpp/src/visual_language/phi3_vision/classes.hpp
@@ -42,7 +42,7 @@ class VisionEncoderPhi3V : public VisionEncoder {
 
     EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
 
 private:
-    bool use_ov_image_preprocess = true; // default use ov image preprocessing, control by env IMAGE_PREPROCESS=CPP to use CPU image preprocessing
+    bool use_ov_vision_preprocess = true;  // use OV vision preprocessing by default; set env VISION_PREPROCESS=CPP to use CPU vision preprocessing
 };
 
diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp
index 86808c93af..c9371a8dd4 100644
--- a/src/cpp/src/visual_language/qwen2vl/classes.cpp
+++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp
@@ -649,8 +649,8 @@ std::unique_ptr<CircularBufferQueue<ov::InferRequest>> create_vision_encoder_ire
         });
 }
 
-bool check_image_preprocess_env() {
-    const char* env = std::getenv("IMAGE_PREPROCESS");
+bool check_vision_preprocess_env() {
+    const char* env = std::getenv("VISION_PREPROCESS");
     return !(env && std::string(env) == "CPP");
 }
 
@@ -658,8 +658,8 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const std::filesystem::path& model_di
     const std::string& device,
     const ov::AnyMap properties) :
     VisionEncoder(model_dir, device, properties),
-    use_ov_image_preprocess(check_image_preprocess_env()) {
-    if (use_ov_image_preprocess) {
+    use_ov_vision_preprocess(check_vision_preprocess_env()) {
+    if (use_ov_vision_preprocess) {
         auto model_org = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
         m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties);
     }
@@ -670,8 +670,8 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const ModelsMap& models_map,
     const std::string& device,
     const ov::AnyMap properties) :
     VisionEncoder(models_map, config_dir_path, device, properties),
-    use_ov_image_preprocess(check_image_preprocess_env()) {
-    if (use_ov_image_preprocess) {
+    use_ov_vision_preprocess(check_vision_preprocess_env()) {
+    if (use_ov_vision_preprocess) {
         const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
         auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
@@ -880,7 +880,7 @@ void VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vector<ov::
     using EncodeFunc = std::function<void(const std::vector<ov::Tensor>&, const ov::AnyMap&, ov::genai::EncodedVideo&, size_t, size_t)>;
 
     EncodeFunc encode_func;
-    if (use_ov_image_preprocess == false) {
+    if (use_ov_vision_preprocess == false) {
         encode_func = [this](const std::vector<ov::Tensor>& image, const ov::AnyMap& config_map, ov::genai::EncodedVideo& encoded_video, size_t frm_num, size_t frm_id) {
             this->encode_with_imagepreprocess_cpp(image, config_map, encoded_video.video_features, encoded_video.resized_source_size, frm_num, frm_id);
         };
diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp
index 83adf54896..d45abdc6bd 100644
--- a/src/cpp/src/visual_language/qwen2vl/classes.hpp
+++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp
@@ -34,7 +34,7 @@ class VisionEncoderQwen2VL : public VisionEncoder {
         size_t frame_num = 1,
         size_t frame_id = 0);
 
-    bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess
+    bool use_ov_vision_preprocess = true;  // use OV vision preprocessing by default; set env VISION_PREPROCESS=CPP to use CPU vision preprocessing
 };
 
 class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
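Usage note: all three encoders above now read the same VISION_PREPROCESS environment variable, and it takes effect only if set before the vision encoder is constructed, because the flag is captured once in the constructor initializer list. A minimal standalone sketch of the toggle semantics follows; the helper body is copied from the patch, while main(), the printed labels, and the use of POSIX setenv are illustrative assumptions (on Windows, _putenv_s would be the equivalent):

#include <cstdlib>
#include <iostream>
#include <string>

// Same check as can_use_ov_vision_preprocess() / check_vision_preprocess_env() in the patch:
// OV-graph preprocessing is the default; VISION_PREPROCESS=CPP selects the CPU path.
static bool can_use_ov_vision_preprocess() {
    const char* env = std::getenv("VISION_PREPROCESS");
    return !(env && std::string(env) == "CPP");
}

int main() {
    std::cout << "unset -> OV preprocessing: " << can_use_ov_vision_preprocess() << "\n";  // prints 1

    setenv("VISION_PREPROCESS", "CPP", 1);  // POSIX; must be set before the pipeline/encoder is created
    std::cout << "CPP   -> OV preprocessing: " << can_use_ov_vision_preprocess() << "\n";  // prints 0

    setenv("VISION_PREPROCESS", "OV", 1);   // any value other than "CPP" keeps the OV path
    std::cout << "OV    -> OV preprocessing: " << can_use_ov_vision_preprocess() << "\n";  // prints 1
    return 0;
}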