@@ -119,31 +119,29 @@ std::shared_ptr<ov::Node> create_center_crop(std::shared_ptr<ov::Node> input, st
 }
 
 // Helper function to calculate resize dimensions based on shortest edge
-std::pair<int64_t, int64_t> calculate_resize_dimensions(
-    size_t orig_height,
-    size_t orig_width,
+ImageSize calculate_resize_dimensions(
+    const ImageSize& original_size,
     int target_shortest_edge) {
-    float scale = static_cast<float>(target_shortest_edge) / std::min(orig_height, orig_width);
-    int64_t new_height = static_cast<int64_t>(orig_height * scale);
-    int64_t new_width = static_cast<int64_t>(orig_width * scale);
+    float scale = static_cast<float>(target_shortest_edge) / std::min(original_size.height, original_size.width);
+    size_t new_height = static_cast<size_t>(original_size.height * scale);
+    size_t new_width = static_cast<size_t>(original_size.width * scale);
     return {new_height, new_width};
 }
 
 // Helper function to set preprocessing parameters for integrated OV preprocessing model
 void set_preprocess_parameters(
     ov::InferRequest& encoder,
     const ov::Tensor& input_frames,
-    size_t orig_height,
-    size_t orig_width,
+    const ImageSize& original_size,
     const ProcessorConfig& config) {
 
     // Calculate resize target size
-    auto [new_height, new_width] = calculate_resize_dimensions(orig_height, orig_width, config.size_shortest_edge);
+    auto resized_size = calculate_resize_dimensions(original_size, config.size_shortest_edge);
 
     // Set resize target size
     ov::Tensor target_size_tensor(ov::element::i64, {2});
-    target_size_tensor.data<int64_t>()[0] = new_height;
-    target_size_tensor.data<int64_t>()[1] = new_width;
+    target_size_tensor.data<int64_t>()[0] = resized_size.height;
+    target_size_tensor.data<int64_t>()[1] = resized_size.width;
 
     // Set crop size
     ov::Tensor crop_size_tensor(ov::element::i64, {2});
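Reviewer note: a quick worked example of the shortest-edge scaling that `calculate_resize_dimensions` implements. The `ImageSize` aggregate below is a stand-in assumed to match the struct this PR introduces (height declared first, then width):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Assumed layout of the ImageSize aggregate used by this PR.
struct ImageSize {
    size_t height;
    size_t width;
};

ImageSize calculate_resize_dimensions(const ImageSize& original_size, int target_shortest_edge) {
    float scale = static_cast<float>(target_shortest_edge) / std::min(original_size.height, original_size.width);
    return {static_cast<size_t>(original_size.height * scale),
            static_cast<size_t>(original_size.width * scale)};
}

int main() {
    // A 224x320 frame with size_shortest_edge = 336: scale = 336/224 = 1.5,
    // so the shortest edge lands exactly on the target and the long edge follows.
    ImageSize resized = calculate_resize_dimensions({224, 320}, 336);
    std::printf("%zux%zu\n", resized.height, resized.width);  // 336x480
}
```

Note the truncating casts: for non-integral scales the long edge rounds down, matching the behavior of the replaced `std::pair` version.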
@@ -155,7 +153,7 @@ void set_preprocess_parameters(
     encoder.set_input_tensor(2, crop_size_tensor);
 }
 
-bool can_use_ov_preprocess() {
+bool can_use_ov_vision_preprocess() {
     const char* env = std::getenv("VISION_PREPROCESS");
     return !(env && std::string(env) == "CPP");
 }
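Behavioral note on the renamed gate: OV vision preprocessing stays the default and is opted out of only when the environment variable equals exactly `CPP`. A minimal sketch of the decision table:

```cpp
#include <cstdlib>
#include <string>

// Same logic as can_use_ov_vision_preprocess() above.
bool can_use_ov_vision_preprocess() {
    const char* env = std::getenv("VISION_PREPROCESS");
    return !(env && std::string(env) == "CPP");
}

// VISION_PREPROCESS unset -> true  (integrated OV preprocessing)
// VISION_PREPROCESS=OV    -> true  (any value other than "CPP")
// VISION_PREPROCESS=CPP   -> false (C++/CPU preprocessing path)
```

Any value other than `CPP` (including the empty string) keeps the OV path, so typos in the variable fail toward the default.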
@@ -225,8 +223,8 @@ std::pair<size_t, size_t> get_unpadded_features(size_t height, size_t width, siz
 clip_image_f32 preprocess_clip_image_llava_next_video(const clip_image_u8& image, ProcessorConfig& config) {
     // Resize
     clip_image_u8 resized_image;
-    auto [new_height, new_width] = calculate_resize_dimensions(image.ny, image.nx, config.size_shortest_edge);
-    bicubic_resize(image, resized_image, static_cast<int>(new_width), static_cast<int>(new_height));
+    auto resized_size = calculate_resize_dimensions({static_cast<size_t>(image.ny), static_cast<size_t>(image.nx)}, config.size_shortest_edge);
+    bicubic_resize(image, resized_image, static_cast<int>(resized_size.width), static_cast<int>(resized_size.height));
 
     // Center crop
     clip_image_u8 cropped_image = center_crop(resized_image, config.crop_size_height, config.crop_size_width);
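For reference, the crop that follows the resize takes a centered `crop_size_height x crop_size_width` window. The `center_crop` helper itself is outside this diff; the sketch below shows only the standard offset arithmetic such a helper performs, not the actual implementation:

```cpp
#include <cstddef>

// Standard center-crop math (illustrative; not the center_crop from this file):
// the crop window starts half the excess in from each edge.
struct CropOffsets {
    size_t top;
    size_t left;
};

CropOffsets center_crop_offsets(size_t height, size_t width, size_t crop_h, size_t crop_w) {
    return {(height - crop_h) / 2, (width - crop_w) / 2};
}
// e.g. a 336x480 resized frame cropped to 336x336 starts at row 0, column 72.
```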
@@ -248,8 +246,8 @@ VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
     const std::filesystem::path& model_dir,
     const std::string& device,
     const ov::AnyMap properties) : VisionEncoderLLaVANext(model_dir, device, properties),
-    use_ov_preprocess(can_use_ov_preprocess()) {
-    if (use_ov_preprocess) {
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         // Create integrated preprocessing + vision encoder model for image/video processing
         auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
         auto model = patch_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
@@ -283,8 +281,8 @@ VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
     const std::filesystem::path& config_dir_path,
     const std::string& device,
     const ov::AnyMap device_config) : VisionEncoderLLaVANext{models_map, config_dir_path, device, device_config},
-    use_ov_preprocess(can_use_ov_preprocess()) {
-    if (use_ov_preprocess) {
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         // Create integrated preprocessing + vision encoder model for image/video processing
         const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
         auto vision_encoder_model_original = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
@@ -330,7 +328,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
     ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
 
     ov::Shape pixel_values_shape;
-    if (use_ov_preprocess) {
+    if (use_ov_vision_preprocess) {
         // Use integrated OV preprocessing model with batch processing similar to get_pixel_values_llava_next
         clip_image_u8 input_image = tensor_to_clip_image_u8(image);
 
@@ -356,7 +354,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
     }
 
     // Set inputs for integrated preprocessing model
-    set_preprocess_parameters(encoder, concatenated_patches, patch_height, patch_width, config);
+    set_preprocess_parameters(encoder, concatenated_patches, {patch_height, patch_width}, config);
 
     // Set pixel_values_shape for later use
     pixel_values_shape = {num_patches, 3, static_cast<size_t>(config.crop_size_height), static_cast<size_t>(config.crop_size_width)};
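One thing worth keeping in mind with the new call sites: if `ImageSize` is a plain aggregate, a braced argument initializes members in declaration order, so `{patch_height, patch_width}` is correct only while `height` is declared before `width` (an assumption inferred from usages in this diff such as `{image.ny, image.nx}`):

```cpp
// Aggregate initialization follows member declaration order:
ImageSize size{patch_height, patch_width};  // size.height == patch_height, size.width == patch_width
// Swapping the two arguments would silently transpose the resize target.
```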
@@ -505,7 +503,7 @@ ov::Tensor InputsEmbedderLLaVANextVideo::get_inputs_embeds(
     return text_embeds;
 }
 
-std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_cpp(const std::vector<ov::Tensor>& frames) {
+ov::Tensor VisionEncoderLLaVANextVideo::preprocess_frames_cpp(const std::vector<ov::Tensor>& frames) {
     ProcessorConfig config = get_processor_config();
     size_t num_frames = frames.size();
     std::vector<ov::Tensor> preprocessed_frames;
@@ -518,13 +516,8 @@ std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_cpp
         preprocessed_frames.push_back(clip_image_f32_to_tensor(preprocessed));
     }
 
-    // Calculate number of video tokens
-    const ov::Shape& first_shape = preprocessed_frames[0].get_shape();
-    size_t height = first_shape[2];
-    size_t width = first_shape[3];
-    size_t num_video_tokens = ((height / m_patch_size) * (width / m_patch_size) / 4) * num_frames;
-
     // Concatenate preprocessed frames to single tensor
+    const ov::Shape& first_shape = preprocessed_frames[0].get_shape();
     ov::Shape concat_shape = first_shape;
     concat_shape[0] = num_frames;
     ov::Tensor concatenated_frames(preprocessed_frames[0].get_element_type(), concat_shape);
@@ -536,32 +529,7 @@ std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_cpp
         frames_data += ov::shape_size(first_shape);
     }
 
-    return {concatenated_frames, num_video_tokens};
-}
-
-std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_ov(const std::vector<ov::Tensor>& frames) {
-    // Preprocessing is integrated into the encoder model
-    // Just concatenate input frames (NHWC uint8 format)
-    ProcessorConfig config = get_processor_config();
-    size_t num_frames = frames.size();
-
-    const ov::Shape& first_shape = frames[0].get_shape();
-    ov::Shape concat_shape = first_shape;
-    concat_shape[0] = num_frames;
-    ov::Tensor concatenated_frames(frames[0].get_element_type(), concat_shape);
-
-    size_t frame_byte_size = frames[0].get_byte_size();
-    uint8_t* frames_data = concatenated_frames.data<uint8_t>();
-    for (size_t i = 0; i < num_frames; i++) {
-        std::memcpy(frames_data, frames[i].data(), frame_byte_size);
-        frames_data += ov::shape_size(first_shape);
-    }
-
-    // Calculate num_video_tokens
-    size_t num_video_tokens = ((config.crop_size_height / m_patch_size) *
-                               (config.crop_size_width / m_patch_size) / 4) * num_frames;
-
-    return {concatenated_frames, num_video_tokens};
+    return concatenated_frames;
 }
 
 std::vector<ov::genai::EncodedVideo> InputsEmbedderLLaVANextVideo::encode_videos(const std::vector<ov::Tensor>& videos) {
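The concatenation kept in `preprocess_frames_cpp` is the usual pattern for batching N same-shaped tensors: allocate one tensor whose leading dimension is N, then `memcpy` each frame at its byte offset. A self-contained sketch, under the assumption that all frames share the first frame's shape and element type:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>
#include <openvino/openvino.hpp>

// Batch N same-shaped tensors along the leading dimension.
ov::Tensor concat_frames(const std::vector<ov::Tensor>& frames) {
    ov::Shape concat_shape = frames[0].get_shape();
    concat_shape[0] = frames.size();
    ov::Tensor out(frames[0].get_element_type(), concat_shape);

    const size_t frame_byte_size = frames[0].get_byte_size();
    auto* dst = static_cast<uint8_t*>(out.data());
    for (const auto& frame : frames) {
        std::memcpy(dst, frame.data(), frame_byte_size);
        dst += frame_byte_size;  // advance by bytes, element type agnostic
    }
    return out;
}
```

Advancing a byte pointer by `get_byte_size()` sidesteps the element-count arithmetic the in-tree loop does with `ov::shape_size(first_shape)`; both end up at the same byte offsets.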
@@ -571,26 +539,27 @@ std::vector<ov::genai::EncodedVideo> InputsEmbedderLLaVANextVideo::encode_videos
     std::vector<ov::genai::EncodedVideo> encoded_videos;
     for (const auto video: videos) {
         std::vector<ov::Tensor> frames = to_single_image_tensors({video});
+        size_t num_frames = frames.size();
 
-        // Use OV or CPU preprocessing based on configuration
-        auto [concatenated_frames, num_video_tokens] = vision_encoder->get_use_ov_preprocess()
-            ? vision_encoder->preprocess_frames_ov(frames)
-            : vision_encoder->preprocess_frames_cpp(frames);
+        // Calculate num_video_tokens (same for both OV and CPU preprocessing)
+        size_t num_video_tokens = ((config.crop_size_height / vision_encoder->get_patch_size()) *
+                                   (config.crop_size_width / vision_encoder->get_patch_size()) / 4) * num_frames;
 
         // infer video feature extraction models
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(vision_encoder->get_vision_encoder());
         ov::InferRequest& encoder = infer_request_guard.get();
 
-        if (vision_encoder->get_use_ov_preprocess()) {
-            // Use integrated OV preprocessing model
+        if (vision_encoder->get_use_ov_vision_preprocess()) {
+            // Use integrated OV preprocessing model - pass video tensor directly
             auto frame_shape = frames[0].get_shape();
             size_t orig_height = frame_shape[1];
             size_t orig_width = frame_shape[2];
 
             // Set inputs for integrated model
-            set_preprocess_parameters(encoder, concatenated_frames, orig_height, orig_width, config);
+            set_preprocess_parameters(encoder, video, {orig_height, orig_width}, config);
         } else {
-            // Use normal encoder (preprocessing already done in preprocess_frames_cpp)
+            // Use CPU preprocessing - preprocess and concatenate frames
+            ov::Tensor concatenated_frames = vision_encoder->preprocess_frames_cpp(frames);
             encoder.set_tensor("pixel_values", concatenated_frames);
         }
 
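To sanity-check the hoisted token count: with typical llava-next-video processor values (illustrative here, not read from this PR's configs) of a 336x336 crop and patch size 14, each frame contributes (336/14) * (336/14) = 576 patches, and the division by 4 matches the 2x2 pooling applied to video features, giving 144 tokens per frame:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative values: crop 336x336, patch size 14, 8 sampled frames.
    const size_t crop_size_height = 336, crop_size_width = 336;
    const size_t patch_size = 14, num_frames = 8;

    // Same expression encode_videos() now computes up front:
    const size_t num_video_tokens = ((crop_size_height / patch_size) *
                                     (crop_size_width / patch_size) / 4) * num_frames;

    std::printf("%zu\n", num_video_tokens);  // (24 * 24 / 4) * 8 = 1152
}
```

Hoisting this out of the two preprocessing paths is what allows `preprocess_frames_cpp` to return a bare `ov::Tensor`: the token count depends only on the config and the frame count, not on which path produced the pixels.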