
Commit c6dcf71

Apply comments
1 parent f6332df commit c6dcf71

File tree

6 files changed: +55 -84 lines changed

src/cpp/src/visual_language/llava_next_video/classes.cpp

Lines changed: 30 additions & 61 deletions
@@ -119,31 +119,29 @@ std::shared_ptr<ov::Node> create_center_crop(std::shared_ptr<ov::Node> input, st
 }
 
 // Helper function to calculate resize dimensions based on shortest edge
-std::pair<int64_t, int64_t> calculate_resize_dimensions(
-    size_t orig_height,
-    size_t orig_width,
+ImageSize calculate_resize_dimensions(
+    const ImageSize& original_size,
     int target_shortest_edge) {
-    float scale = static_cast<float>(target_shortest_edge) / std::min(orig_height, orig_width);
-    int64_t new_height = static_cast<int64_t>(orig_height * scale);
-    int64_t new_width = static_cast<int64_t>(orig_width * scale);
+    float scale = static_cast<float>(target_shortest_edge) / std::min(original_size.height, original_size.width);
+    size_t new_height = static_cast<size_t>(original_size.height * scale);
+    size_t new_width = static_cast<size_t>(original_size.width * scale);
     return {new_height, new_width};
 }
 
 // Helper function to set preprocessing parameters for integrated OV preprocessing model
 void set_preprocess_parameters(
     ov::InferRequest& encoder,
     const ov::Tensor& input_frames,
-    size_t orig_height,
-    size_t orig_width,
+    const ImageSize& original_size,
     const ProcessorConfig& config) {
 
     // Calculate resize target size
-    auto [new_height, new_width] = calculate_resize_dimensions(orig_height, orig_width, config.size_shortest_edge);
+    auto resized_size = calculate_resize_dimensions(original_size, config.size_shortest_edge);
 
     // Set resize target size
     ov::Tensor target_size_tensor(ov::element::i64, {2});
-    target_size_tensor.data<int64_t>()[0] = new_height;
-    target_size_tensor.data<int64_t>()[1] = new_width;
+    target_size_tensor.data<int64_t>()[0] = resized_size.height;
+    target_size_tensor.data<int64_t>()[1] = resized_size.width;
 
     // Set crop size
     ov::Tensor crop_size_tensor(ov::element::i64, {2});
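
Worked example of the new shortest-edge rule, as a standalone sketch (the ImageSize layout of {height, width} and the sample numbers are assumptions for illustration, not taken from this commit):

#include <algorithm>
#include <cstddef>

struct ImageSize { size_t height; size_t width; };  // assumed layout, mirroring the type used above

// Scale so the shorter side matches target_shortest_edge, preserving aspect
// ratio, as calculate_resize_dimensions does after this change.
ImageSize shortest_edge_resize(const ImageSize& original, int target_shortest_edge) {
    float scale = static_cast<float>(target_shortest_edge) /
                  std::min(original.height, original.width);
    return {static_cast<size_t>(original.height * scale),
            static_cast<size_t>(original.width * scale)};
}

// Example: a 480x640 frame with size_shortest_edge = 336 gives scale = 336/480 = 0.7,
// i.e. {336, 448}; the subsequent center crop reduces that to the configured crop size.
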
@@ -155,7 +153,7 @@ void set_preprocess_parameters(
     encoder.set_input_tensor(2, crop_size_tensor);
 }
 
-bool can_use_ov_preprocess() {
+bool can_use_ov_vision_preprocess() {
     const char* env = std::getenv("VISION_PREPROCESS");
     return !(env && std::string(env) == "CPP");
 }
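
The renamed gate keeps OV preprocessing as the default and falls back only when the variable equals the exact string "CPP"; a minimal reproduction of the predicate (the helper name here is hypothetical):

#include <cstdlib>
#include <string>

// Mirrors can_use_ov_vision_preprocess(): true (use OV) unless the
// environment value is present and equals "CPP" exactly (case-sensitive).
bool use_ov_path(const char* env_value) {
    return !(env_value && std::string(env_value) == "CPP");
}

// use_ov_path(nullptr) == true, use_ov_path("CPP") == false,
// use_ov_path("cpp") == true -- lower case does not trigger the CPU fallback.

So running with VISION_PREPROCESS=CPP selects the CPU path; any other value, or an unset variable, keeps the integrated OV model.
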
@@ -225,8 +223,8 @@ std::pair<size_t, size_t> get_unpadded_features(size_t height, size_t width, siz
 clip_image_f32 preprocess_clip_image_llava_next_video(const clip_image_u8& image, ProcessorConfig& config) {
     // Resize
     clip_image_u8 resized_image;
-    auto [new_height, new_width] = calculate_resize_dimensions(image.ny, image.nx, config.size_shortest_edge);
-    bicubic_resize(image, resized_image, static_cast<int>(new_width), static_cast<int>(new_height));
+    auto resized_size = calculate_resize_dimensions({static_cast<size_t>(image.ny), static_cast<size_t>(image.nx)}, config.size_shortest_edge);
+    bicubic_resize(image, resized_image, static_cast<int>(resized_size.width), static_cast<int>(resized_size.height));
 
     // Center crop
     clip_image_u8 cropped_image = center_crop(resized_image, config.crop_size_height, config.crop_size_width);
@@ -248,8 +246,8 @@ VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
     const std::filesystem::path& model_dir,
     const std::string& device,
     const ov::AnyMap properties) : VisionEncoderLLaVANext(model_dir, device, properties),
-    use_ov_preprocess(can_use_ov_preprocess()) {
-    if (use_ov_preprocess) {
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         // Create integrated preprocessing + vision encoder model for image/video processing
         auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
         auto model = patch_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
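
patch_preprocess_into_vision_encoder_model itself is not part of this diff. As a rough idea of what folding preprocessing into a model can look like, a hypothetical sketch using OpenVINO's PrePostProcessor (not the commit's actual implementation, which presumably also wires the dynamic target/crop size inputs that set_preprocess_parameters fills at inference time):

#include <openvino/openvino.hpp>

// Illustrative only: embed u8->f32 conversion, layout change and scaling
// into the graph so raw NHWC frames can be fed to the compiled model.
std::shared_ptr<ov::Model> embed_preprocess(std::shared_ptr<ov::Model> model) {
    ov::preprocess::PrePostProcessor ppp(model);
    ppp.input().tensor()
        .set_element_type(ov::element::u8)   // accept raw u8 frames
        .set_layout("NHWC");
    ppp.input().model().set_layout("NCHW");  // the encoder expects planar input
    ppp.input().preprocess()
        .convert_element_type(ov::element::f32)
        .scale(255.0f);                      // divide by 255: [0,255] -> [0,1]
    return ppp.build();
}
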
@@ -283,8 +281,8 @@ VisionEncoderLLaVANextVideo::VisionEncoderLLaVANextVideo(
     const std::filesystem::path& config_dir_path,
     const std::string& device,
     const ov::AnyMap device_config) : VisionEncoderLLaVANext{models_map, config_dir_path, device, device_config},
-    use_ov_preprocess(can_use_ov_preprocess()) {
-    if (use_ov_preprocess) {
+    use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         // Create integrated preprocessing + vision encoder model for image/video processing
         const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
         auto vision_encoder_model_original = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
@@ -330,7 +328,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
     ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
 
     ov::Shape pixel_values_shape;
-    if (use_ov_preprocess) {
+    if (use_ov_vision_preprocess) {
         // Use integrated OV preprocessing model with batch processing similar to get_pixel_values_llava_next
         clip_image_u8 input_image = tensor_to_clip_image_u8(image);
 
@@ -356,7 +354,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
     }
 
     // Set inputs for integrated preprocessing model
-    set_preprocess_parameters(encoder, concatenated_patches, patch_height, patch_width, config);
+    set_preprocess_parameters(encoder, concatenated_patches, {patch_height, patch_width}, config);
 
     // Set pixel_values_shape for later use
     pixel_values_shape = {num_patches, 3, static_cast<size_t>(config.crop_size_height), static_cast<size_t>(config.crop_size_width)};
@@ -505,7 +503,7 @@ ov::Tensor InputsEmbedderLLaVANextVideo::get_inputs_embeds(
     return text_embeds;
 }
 
-std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_cpp(const std::vector<ov::Tensor>& frames) {
+ov::Tensor VisionEncoderLLaVANextVideo::preprocess_frames_cpp(const std::vector<ov::Tensor>& frames) {
     ProcessorConfig config = get_processor_config();
     size_t num_frames = frames.size();
     std::vector<ov::Tensor> preprocessed_frames;
@@ -518,13 +516,8 @@ std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_cpp
         preprocessed_frames.push_back(clip_image_f32_to_tensor(preprocessed));
     }
 
-    // Calculate number of video tokens
-    const ov::Shape& first_shape = preprocessed_frames[0].get_shape();
-    size_t height = first_shape[2];
-    size_t width = first_shape[3];
-    size_t num_video_tokens = ((height / m_patch_size) * (width / m_patch_size) / 4) * num_frames;
-
     // Concatenate preprocessed frames to single tensor
+    const ov::Shape& first_shape = preprocessed_frames[0].get_shape();
     ov::Shape concat_shape = first_shape;
     concat_shape[0] = num_frames;
     ov::Tensor concatenated_frames(preprocessed_frames[0].get_element_type(), concat_shape);
@@ -536,32 +529,7 @@ std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_cpp
         frames_data += ov::shape_size(first_shape);
     }
 
-    return {concatenated_frames, num_video_tokens};
-}
-
-std::pair<ov::Tensor, size_t> VisionEncoderLLaVANextVideo::preprocess_frames_ov(const std::vector<ov::Tensor>& frames) {
-    // Preprocessing is integrated into the encoder model
-    // Just concatenate input frames (NHWC uint8 format)
-    ProcessorConfig config = get_processor_config();
-    size_t num_frames = frames.size();
-
-    const ov::Shape& first_shape = frames[0].get_shape();
-    ov::Shape concat_shape = first_shape;
-    concat_shape[0] = num_frames;
-    ov::Tensor concatenated_frames(frames[0].get_element_type(), concat_shape);
-
-    size_t frame_byte_size = frames[0].get_byte_size();
-    uint8_t* frames_data = concatenated_frames.data<uint8_t>();
-    for (size_t i = 0; i < num_frames; i++) {
-        std::memcpy(frames_data, frames[i].data(), frame_byte_size);
-        frames_data += ov::shape_size(first_shape);
-    }
-
-    // Calculate num_video_tokens
-    size_t num_video_tokens = ((config.crop_size_height / m_patch_size) *
-                               (config.crop_size_width / m_patch_size) / 4) * num_frames;
-
-    return {concatenated_frames, num_video_tokens};
+    return concatenated_frames;
 }
 
 std::vector<ov::genai::EncodedVideo> InputsEmbedderLLaVANextVideo::encode_videos(const std::vector<ov::Tensor>& videos) {
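
The surviving concatenation loop copies equal-shaped frame tensors back to back by advancing a raw pointer. The same pattern in self-contained form, with plain float buffers standing in for ov::Tensor (illustrative, not the commit's code):

#include <cstddef>
#include <cstring>
#include <vector>

// Concatenate equal-sized frame buffers into one contiguous allocation,
// advancing the destination pointer by one frame's element count per copy.
std::vector<float> concat_frames(const std::vector<std::vector<float>>& frames) {
    const size_t per_frame = frames.front().size();
    std::vector<float> out(frames.size() * per_frame);
    float* dst = out.data();
    for (const auto& frame : frames) {
        std::memcpy(dst, frame.data(), per_frame * sizeof(float));
        dst += per_frame;
    }
    return out;
}
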
@@ -571,26 +539,27 @@ std::vector<ov::genai::EncodedVideo> InputsEmbedderLLaVANextVideo::encode_videos
     std::vector<ov::genai::EncodedVideo> encoded_videos;
     for (const auto video: videos) {
         std::vector<ov::Tensor> frames = to_single_image_tensors({video});
+        size_t num_frames = frames.size();
 
-        // Use OV or CPU preprocessing based on configuration
-        auto [concatenated_frames, num_video_tokens] = vision_encoder->get_use_ov_preprocess()
-            ? vision_encoder->preprocess_frames_ov(frames)
-            : vision_encoder->preprocess_frames_cpp(frames);
+        // Calculate num_video_tokens (same for both OV and CPU preprocessing)
+        size_t num_video_tokens = ((config.crop_size_height / vision_encoder->get_patch_size()) *
+                                   (config.crop_size_width / vision_encoder->get_patch_size()) / 4) * num_frames;
 
         // infer video feature extraction models
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(vision_encoder->get_vision_encoder());
         ov::InferRequest& encoder = infer_request_guard.get();
 
-        if (vision_encoder->get_use_ov_preprocess()) {
-            // Use integrated OV preprocessing model
+        if (vision_encoder->get_use_ov_vision_preprocess()) {
+            // Use integrated OV preprocessing model - pass video tensor directly
             auto frame_shape = frames[0].get_shape();
             size_t orig_height = frame_shape[1];
             size_t orig_width = frame_shape[2];
 
             // Set inputs for integrated model
-            set_preprocess_parameters(encoder, concatenated_frames, orig_height, orig_width, config);
+            set_preprocess_parameters(encoder, video, {orig_height, orig_width}, config);
         } else {
-            // Use normal encoder (preprocessing already done in preprocess_frames_cpp)
+            // Use CPU preprocessing - preprocess and concatenate frames
+            ov::Tensor concatenated_frames = vision_encoder->preprocess_frames_cpp(frames);
             encoder.set_tensor("pixel_values", concatenated_frames);
         }
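
Sanity check on the hoisted token count, using a 336x336 crop and patch size 14 (typical LLaVA-NeXT-Video values, assumed here for illustration rather than read from this diff):

#include <cstddef>

// num_video_tokens = ((crop_h / patch) * (crop_w / patch) / 4) * num_frames
// (336 / 14) * (336 / 14) / 4 = 24 * 24 / 4 = 144 tokens per frame,
// so an 8-frame clip yields 144 * 8 = 1152 video tokens. The / 4 corresponds
// to 2x2 pooling of the patch grid, which is why the count is the same
// regardless of which preprocessing path produced the pixels.
size_t num_video_tokens(size_t crop_h, size_t crop_w, size_t patch, size_t num_frames) {
    return ((crop_h / patch) * (crop_w / patch) / 4) * num_frames;
}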

src/cpp/src/visual_language/llava_next_video/classes.hpp

Lines changed: 8 additions & 6 deletions
@@ -15,9 +15,7 @@ class VisionEncoderLLaVANextVideo : public VisionEncoderLLaVANext {
 
     EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
 
-    std::pair<ov::Tensor, size_t> preprocess_frames_cpp(const std::vector<ov::Tensor>& frames);
-
-    std::pair<ov::Tensor, size_t> preprocess_frames_ov(const std::vector<ov::Tensor>& frames);
+    ov::Tensor preprocess_frames_cpp(const std::vector<ov::Tensor>& frames);
 
     VisionEncoderLLaVANextVideo(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap properties);
 
@@ -38,15 +36,19 @@ class VisionEncoderLLaVANextVideo : public VisionEncoderLLaVANext {
         return m_ireq_queue_vision_resampler.get();
     }
 
-    bool get_use_ov_preprocess() const {
-        return use_ov_preprocess;
+    bool get_use_ov_vision_preprocess() const {
+        return use_ov_vision_preprocess;
+    }
+
+    size_t get_patch_size() const {
+        return m_patch_size;
     }
 
 private:
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_multi_modal_projector;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_resampler;
     size_t m_patch_size;
-    bool use_ov_preprocess = true;
+    bool use_ov_vision_preprocess = true; // default use ov vision preprocessing, control by env VISION_PREPROCESS=CPP to use CPU vision preprocessing
 };
 
 class InputsEmbedderLLaVANextVideo : public InputsEmbedderLLaVANext {

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 7 additions & 7 deletions
@@ -888,7 +888,7 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
 
     ImageSize image_size;
 
-    if (use_ov_image_preprocess) {
+    if (use_ov_vision_preprocess) {
         ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops);
         image_size = ImageSize{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
 
@@ -921,17 +921,17 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
     return encoded_image;
 }
 
-bool can_use_ov_image_preprocess() {
-    const char* env = std::getenv("IMAGE_PREPROCESS");
+bool can_use_ov_vision_preprocess() {
+    const char* env = std::getenv("VISION_PREPROCESS");
     return !(env && std::string(env) == "CPP");
 }
 
 VisionEncoderPhi3V::VisionEncoderPhi3V(const std::filesystem::path& model_dir,
                                        const std::string& device,
                                        const ov::AnyMap properties)
     : VisionEncoder(model_dir, device, properties),
-      use_ov_image_preprocess(can_use_ov_image_preprocess()) {
-    if (use_ov_image_preprocess) {
+      use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
         auto model = patch_image_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
         auto compiled_model = utils::singleton_core().compile_model(model, device, properties);
@@ -963,8 +963,8 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(const ModelsMap& models_map,
                                        const std::string& device,
                                        const ov::AnyMap properties)
     : VisionEncoder(models_map, config_dir_path, device, properties),
-      use_ov_image_preprocess(can_use_ov_image_preprocess()) {
-    if (use_ov_image_preprocess) {
+      use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
+    if (use_ov_vision_preprocess) {
         const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
         auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
         auto model = patch_image_preprocess_into_vision_encoder_model(model_org, m_processor_config);

src/cpp/src/visual_language/phi3_vision/classes.hpp

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ class VisionEncoderPhi3V : public VisionEncoder {
     EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
 
 private:
-    bool use_ov_image_preprocess = true; // default use ov image preprocessing, control by env IMAGE_PREPROCESS=CPP to use CPU image preprocessing
+    bool use_ov_vision_preprocess = true; // default use ov vision preprocessing, control by env VISION_PREPROCESS=CPP to use CPU vision preprocessing
 
 };

src/cpp/src/visual_language/qwen2vl/classes.cpp

Lines changed: 8 additions & 8 deletions
@@ -649,17 +649,17 @@ std::unique_ptr<CircularBufferQueue<ov::InferRequest>> create_vision_encoder_ire
     });
 }
 
-bool check_image_preprocess_env() {
-    const char* env = std::getenv("IMAGE_PREPROCESS");
+bool check_vision_preprocess_env() {
+    const char* env = std::getenv("VISION_PREPROCESS");
     return !(env && std::string(env) == "CPP");
 }
 
 VisionEncoderQwen2VL::VisionEncoderQwen2VL(const std::filesystem::path& model_dir,
                                            const std::string& device,
                                            const ov::AnyMap properties)
     : VisionEncoder(model_dir, device, properties),
-      use_ov_image_preprocess(check_image_preprocess_env()) {
-    if (use_ov_image_preprocess) {
+      use_ov_vision_preprocess(check_vision_preprocess_env()) {
+    if (use_ov_vision_preprocess) {
         auto model_org = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
         m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties);
     }
@@ -670,8 +670,8 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const ModelsMap& models_map,
                                            const std::string& device,
                                            const ov::AnyMap properties)
     : VisionEncoder(models_map, config_dir_path, device, properties),
-      use_ov_image_preprocess(check_image_preprocess_env()) {
-    if (use_ov_image_preprocess) {
+      use_ov_vision_preprocess(check_vision_preprocess_env()) {
+    if (use_ov_vision_preprocess) {
         const auto& [vision_encoder_model, vision_encoder_weights] =
             utils::get_model_weights_pair(models_map, "vision_embeddings");
         auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
@@ -880,7 +880,7 @@ void VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vector<ov::
 
 EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
     EncodedImage encoded_img;
-    if (use_ov_image_preprocess == false) {
+    if (use_ov_vision_preprocess == false) {
         encode_with_imagepreprocess_cpp({image}, config_map, encoded_img.resized_source, encoded_img.resized_source_size);
         return encoded_img;
     }
@@ -899,7 +899,7 @@ EncodedVideo VisionEncoderQwen2VL::encode_frames(const std::vector<ov::Tensor>&
 
     using EncodeFunc = std::function<void(const std::vector<ov::Tensor>&, const ov::AnyMap&, ov::genai::EncodedVideo&, size_t, size_t)>;
     EncodeFunc encode_func;
-    if (use_ov_image_preprocess == false) {
+    if (use_ov_vision_preprocess == false) {
         encode_func = [this](const std::vector<ov::Tensor>& image, const ov::AnyMap& config_map, ov::genai::EncodedVideo& encoded_video, size_t frm_num, size_t frm_id) {
             this->encode_with_imagepreprocess_cpp(image, config_map, encoded_video.video_features, encoded_video.resized_source_size, frm_num, frm_id);
         };
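
encode_frames binds the chosen path to a std::function once, before the per-frame loop, instead of branching on every frame. The shape of that pattern in isolation (names and bodies hypothetical):

#include <cstdio>
#include <functional>

using EncodeFn = std::function<void(int frame_id)>;

void encode_all(bool use_ov, int num_frames) {
    // Select the lambda once; the loop then calls through the same object.
    EncodeFn encode = use_ov
        ? EncodeFn([](int id) { std::printf("ov path, frame %d\n", id); })
        : EncodeFn([](int id) { std::printf("cpp path, frame %d\n", id); });
    for (int id = 0; id < num_frames; ++id) {
        encode(id);
    }
}
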

src/cpp/src/visual_language/qwen2vl/classes.hpp

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ class VisionEncoderQwen2VL : public VisionEncoder {
                             size_t frame_num = 1,
                             size_t frame_id = 0);
 
-    bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess
+    bool use_ov_vision_preprocess = true; // default use ov vision preprocess, control by env VISION_PREPROCESS=CPP to use cpp vision preprocess
 };
 
 class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
