Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
367 changes: 316 additions & 51 deletions src/cpp/src/visual_language/llava_next_video/classes.cpp
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test is missing. That's something we didn't think about earlier, so you can skip it in this PR; it would be nice to add it in a separate PR. There are already tests for pre-resized images. The new test should verify that images which don't trigger a resize produce the same result with and without VISION_PREPROCESS.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you mind if I handled this in a separate PR?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion src/cpp/src/visual_language/llava_next_video/classes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class VisionEncoderLLaVANextVideo : public VisionEncoderLLaVANext {

EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;

std::pair<std::vector<ov::Tensor>, size_t> preprocess_frames(const std::vector<ov::Tensor>& frames);
ov::Tensor preprocess_frames_cpp(const std::vector<ov::Tensor>& frames);

VisionEncoderLLaVANextVideo(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap properties);

Expand All @@ -36,10 +36,19 @@ class VisionEncoderLLaVANextVideo : public VisionEncoderLLaVANext {
return m_ireq_queue_vision_resampler.get();
}

// Returns true (the default) when the OV-graph vision preprocessing path is
// in use; false when VISION_PREPROCESS=CPP selected CPU preprocessing.
bool get_use_ov_vision_preprocess() const {
return use_ov_vision_preprocess;
}

// Accessor for the vision model's patch size (m_patch_size).
// NOTE(review): presumably the ViT patch edge length from the model config —
// the initialization site is not visible here; confirm against the .cpp.
size_t get_patch_size() const {
return m_patch_size;
}

private:
std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_multi_modal_projector;
std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_resampler;
size_t m_patch_size;
bool use_ov_vision_preprocess = true; // default use ov vision preprocessing, control by env VISION_PREPROCESS=CPP to use CPU vision preprocessing
};

class InputsEmbedderLLaVANextVideo : public InputsEmbedderLLaVANext {
Expand Down
14 changes: 7 additions & 7 deletions src/cpp/src/visual_language/phi3_vision/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,7 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa

ImageSize image_size;

if (use_ov_image_preprocess) {
if (use_ov_vision_preprocess) {
ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops);
image_size = ImageSize{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};

Expand Down Expand Up @@ -921,17 +921,17 @@ EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMa
return encoded_image;
}

/// Decides whether the OpenVINO-graph vision preprocessing path should be used.
/// Returns true by default; setting the environment variable
/// VISION_PREPROCESS=CPP (exact, case-sensitive match) opts into the CPU (C++)
/// preprocessing path instead. Any other value, or an unset variable, keeps
/// the OV path enabled.
bool can_use_ov_vision_preprocess() {
    const char* env = std::getenv("VISION_PREPROCESS");
    // Only the exact value "CPP" disables the OV preprocessing path.
    return !(env && std::string(env) == "CPP");
}

VisionEncoderPhi3V::VisionEncoderPhi3V(const std::filesystem::path& model_dir,
const std::string& device,
const ov::AnyMap properties)
: VisionEncoder(model_dir, device, properties),
use_ov_image_preprocess(can_use_ov_image_preprocess()) {
if (use_ov_image_preprocess) {
use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
if (use_ov_vision_preprocess) {
auto vision_encoder_model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
auto model = patch_image_preprocess_into_vision_encoder_model(vision_encoder_model, m_processor_config);
auto compiled_model = utils::singleton_core().compile_model(model, device, properties);
Expand Down Expand Up @@ -963,8 +963,8 @@ VisionEncoderPhi3V::VisionEncoderPhi3V(const ModelsMap& models_map,
const std::string& device,
const ov::AnyMap properties)
: VisionEncoder(models_map, config_dir_path, device, properties),
use_ov_image_preprocess(can_use_ov_image_preprocess()) {
if (use_ov_image_preprocess) {
use_ov_vision_preprocess(can_use_ov_vision_preprocess()) {
if (use_ov_vision_preprocess) {
const auto& [vision_encoder_model, vision_encoder_weights] = utils::get_model_weights_pair(models_map, "vision_embeddings");
auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
auto model = patch_image_preprocess_into_vision_encoder_model(model_org, m_processor_config);
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/phi3_vision/classes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class VisionEncoderPhi3V : public VisionEncoder {
EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;

private:
bool use_ov_image_preprocess = true; // default use ov image preprocessing, control by env IMAGE_PREPROCESS=CPP to use CPU image preprocessing
bool use_ov_vision_preprocess = true; // default use ov vision preprocessing, control by env VISION_PREPROCESS=CPP to use CPU vision preprocessing

};

Expand Down
16 changes: 8 additions & 8 deletions src/cpp/src/visual_language/qwen2vl/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -649,17 +649,17 @@ std::unique_ptr<CircularBufferQueue<ov::InferRequest>> create_vision_encoder_ire
});
}

/// Reads the VISION_PREPROCESS environment variable to select the vision
/// preprocessing backend. Returns true (use the OpenVINO-graph path) unless
/// the variable is set to exactly "CPP" (case-sensitive), which selects the
/// CPU (C++) preprocessing path.
bool check_vision_preprocess_env() {
    const char* env = std::getenv("VISION_PREPROCESS");
    // Only the exact value "CPP" disables the OV preprocessing path.
    return !(env && std::string(env) == "CPP");
}

VisionEncoderQwen2VL::VisionEncoderQwen2VL(const std::filesystem::path& model_dir,
const std::string& device,
const ov::AnyMap properties)
: VisionEncoder(model_dir, device, properties),
use_ov_image_preprocess(check_image_preprocess_env()) {
if (use_ov_image_preprocess) {
use_ov_vision_preprocess(check_vision_preprocess_env()) {
if (use_ov_vision_preprocess) {
auto model_org = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties);
}
Expand All @@ -670,8 +670,8 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const ModelsMap& models_map,
const std::string& device,
const ov::AnyMap properties)
: VisionEncoder(models_map, config_dir_path, device, properties),
use_ov_image_preprocess(check_image_preprocess_env()) {
if (use_ov_image_preprocess) {
use_ov_vision_preprocess(check_vision_preprocess_env()) {
if (use_ov_vision_preprocess) {
const auto& [vision_encoder_model, vision_encoder_weights] =
utils::get_model_weights_pair(models_map, "vision_embeddings");
auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
Expand Down Expand Up @@ -880,7 +880,7 @@ void VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vector<ov::

EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
EncodedImage encoded_img;
if (use_ov_image_preprocess == false) {
if (use_ov_vision_preprocess == false) {
encode_with_imagepreprocess_cpp({image}, config_map, encoded_img.resized_source, encoded_img.resized_source_size);
return encoded_img;
}
Expand All @@ -899,7 +899,7 @@ EncodedVideo VisionEncoderQwen2VL::encode_frames(const std::vector<ov::Tensor>&

using EncodeFunc = std::function<void(const std::vector<ov::Tensor>&, const ov::AnyMap&, ov::genai::EncodedVideo&, size_t, size_t)>;
EncodeFunc encode_func;
if (use_ov_image_preprocess == false) {
if (use_ov_vision_preprocess == false) {
encode_func = [this](const std::vector<ov::Tensor>& image, const ov::AnyMap& config_map, ov::genai::EncodedVideo& encoded_video, size_t frm_num, size_t frm_id) {
this->encode_with_imagepreprocess_cpp(image, config_map, encoded_video.video_features, encoded_video.resized_source_size, frm_num, frm_id);
};
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/qwen2vl/classes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class VisionEncoderQwen2VL : public VisionEncoder {
size_t frame_num = 1,
size_t frame_id = 0);

bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess
bool use_ov_vision_preprocess = true; // default use ov vision preprocess, control by env VISION_PREPROCESS=CPP to use cpp vision preprocess
};

class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
Expand Down
Loading