diff --git a/demos/rerank/README.md b/demos/rerank/README.md
index 04f4ae55eb..189c3cc9e8 100644
--- a/demos/rerank/README.md
+++ b/demos/rerank/README.md
@@ -146,7 +146,7 @@ index 1, relevance_score 0.09138210117816925
 
 :::{dropdown} **Requesting rerank score with model that requires template applying on query and documents**
 
-tomaarsen/Qwen3-Reranker-0.6B-seq-cls is a copy of the Qwen3-Reranker-0.6B model (original model is not supported in OVMS) modified as a sequence classification model instead. It requires applying template on input, here is example client that does it:
+OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov is a copy of the Qwen3-Reranker-0.6B model (the original model is not supported in OVMS) modified to work as a sequence classification model. It requires applying a template to the input; here is an example client that does it:
 
 ```bash
 pip3 install requests
 ```
@@ -180,7 +180,7 @@ documents = [
 
 response = requests.post("http://127.0.0.1:8000/v3/rerank", json={
-    "model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+    "model": "OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov",
     "query": query,
     "documents": documents,
 }).json()
@@ -191,7 +191,7 @@ python rerank_client.py
 ```
 It will return response similar to:
 ```
-{'results': [{'index': 0, 'relevance_score': 0.024518223479390144}, {'index': 1, 'relevance_score': 0.0026006349362432957}]}
+{'results': [{'index': 0, 'relevance_score': 0.0216273982077837}, {'index': 1, 'relevance_score': 0.018804751336574554}]}
 ```
 
 :::
@@ -260,4 +260,35 @@ tomaarsen/Qwen3-Reranker-0.6B-seq-cls
 
 Check [RAG demo](../continuous_batching/rag/README.md) which employs `rerank` endpoint together with `chat/completions` and `embeddings`.
 
+# Usage of tokenize endpoint (release 2026.0 or weekly builds)
+
+The `tokenize` endpoint provides a simple API for tokenizing input text using the same tokenizer as the deployed rerank model. This allows you to see how your text will be split into tokens before feature extraction or inference. The endpoint accepts a string or a list of strings and returns the corresponding token IDs.
+
+Example usage:
+```console
+curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"text\": \"hello world\" }"
+```
+Response:
+```json
+{
+  "tokens": [33600,31,8999]
+}
+```
+
+It is possible to use additional parameters:
+ - `pad_to_max_length` - whether to pad the sequence to the maximum length. Default is `false`.
+ - `max_length` - maximum length of the sequence. If specified, tokens are truncated to this length.
+ - `padding_side` - side on which the sequence is padded; can be `left` or `right`. Default is `right`.
+
+Example usage:
+```console
+curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\"}"
+```
+
+Response:
+```json
+{
+  "tokens": [1,1,1,1,1,1,1,33600,31,8999]
+}
+```
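+
+For programmatic access, below is a minimal Python sketch of a tokenize client (assumptions: the server from this demo is listening on port 8000 and serves the `OpenVINO/bge-reranker-base-int8-ov` model; only the `requests` package is required):
+```python
+import requests
+
+# "text" may be a single string or a list of strings (batch mode).
+response = requests.post("http://localhost:8000/v3/tokenize", json={
+    "model": "OpenVINO/bge-reranker-base-int8-ov",
+    "text": ["hello", "hello world"],
+    "max_length": 6,
+    "pad_to_max_length": True,
+}).json()
+
+# One list of token IDs per input string, padded with the pad token
+# on the right side (the default padding_side) up to max_length.
+print(response["tokens"])
+```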
diff --git a/demos/rerank/compare_results.py b/demos/rerank/compare_results.py
index 22a5ba54dd..4f0e682a72 100644
--- a/demos/rerank/compare_results.py
+++ b/demos/rerank/compare_results.py
@@ -25,7 +25,7 @@
 parser = argparse.ArgumentParser(description='Compare rerank responses from HF transformers OVMS')
 parser.add_argument('--base_url', required=False, default='http://localhost:8000/v3/', help='Specify url to embeddings endpoint. default:http://localhost:8000/v3', dest='base_url')
-parser.add_argument('--model_name', default='BAAI/bge-reranker-large', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5',
+parser.add_argument('--model_name', default='BAAI/bge-reranker-large', help='Model name to query. default: BAAI/bge-reranker-large',
                     dest='model_name')
 parser.add_argument('--query', default='', help='Query string to rerank.', dest='query')
diff --git a/demos/rerank/config.json b/demos/rerank/config.json
deleted file mode 100644
index 6510ff1f1e..0000000000
--- a/demos/rerank/config.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-    "model_config_list": [
-        {
-            "config": {
-                "name": "tokenizer",
-                "base_path": "/workspace/models/BAAI/bge-reranker-large-tokenizer"
-            }
-        },
-        {
-            "config": {
-                "name": "rerank_model",
-                "base_path": "/workspace/models/BAAI/bge-reranker-large-rerank"
-            }
-        }
-    ],
-    "mediapipe_config_list": [
-        {
-            "name": "rerank",
-            "graph_path": "/workspace/models/graph.pbtxt"
-        }
-    ]
-}
diff --git a/demos/rerank/models/graph.pbtxt b/demos/rerank/models/graph.pbtxt
deleted file mode 100644
index d68fa2c511..0000000000
--- a/demos/rerank/models/graph.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-input_stream: "REQUEST_PAYLOAD:input"
-output_stream: "RESPONSE_PAYLOAD:output"
-node {
-  calculator: "OpenVINOModelServerSessionCalculator"
-  output_side_packet: "SESSION:tokenizer"
-  node_options: {
-    [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: {
-      servable_name: "tokenizer"
-      servable_version: "1"
-    }
-  }
-}
-node {
-  calculator: "OpenVINOModelServerSessionCalculator"
-  output_side_packet: "SESSION:rerank"
-  node_options: {
-    [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: {
-      servable_name: "rerank_model"
-      servable_version: "1"
-    }
-  }
-}
-node {
-  input_side_packet: "TOKENIZER_SESSION:tokenizer"
-  input_side_packet: "RERANK_SESSION:rerank"
-  calculator: "RerankCalculator"
-  input_stream: "REQUEST_PAYLOAD:input"
-  output_stream: "RESPONSE_PAYLOAD:output"
-}
diff --git a/src/rerank/BUILD b/src/rerank/BUILD
index 9d3f88605a..7f3b1a6ec9 100644
--- a/src/rerank/BUILD
+++ b/src/rerank/BUILD
@@ -80,6 +80,7 @@ ovms_cc_library(
         "//src:model_metric_reporter",
         "//src:executingstreamidguard",
         "//src:libovms_execution_context",
+        "//src/tokenize:tokenize_parser",
    ],
    visibility = ["//visibility:public"],
    alwayslink = 1,
diff --git a/src/rerank/rerank_calculator_ov.cc b/src/rerank/rerank_calculator_ov.cc
index 5cefcaddce..48036362e8 100644
--- a/src/rerank/rerank_calculator_ov.cc
+++ b/src/rerank/rerank_calculator_ov.cc
@@ -42,6 +42,7 @@
 #include "rerank_servable.hpp"
 #include "../model_metric_reporter.hpp"
 #include "../executingstreamidguard.hpp"
+#include "../tokenize/tokenize_parser.hpp"
 
 using namespace rapidjson;
 using namespace ovms;
@@ -289,6 +290,26 @@ class RerankCalculatorOV : public CalculatorBase {
         InputDataType payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get<InputDataType>();
         SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request body: {}", payload.body);
         SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request uri: {}", payload.uri);
+
+        // Tokenize requests are answered directly and skip regular rerank processing.
+        if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
+            TokenizeRequest tokenizeRequest;
+            absl::Status status = TokenizeParser::parseTokenizeRequest(*payload.parsedJson, tokenizeRequest);
+            if (!status.ok()) {
+                return status;
+            }
+            tokenizeRequest.parameters["add_special_tokens"] = false;  // Rerank model tokenizer should not add special tokens
+            if (auto strings = std::get_if<std::vector<std::string>>(&tokenizeRequest.input)) {
+                auto tokens = rerank_session->getTokenizer().encode(*strings, tokenizeRequest.parameters);
+                StringBuffer buffer;
+                status = TokenizeParser::parseTokenizeResponse(buffer, tokens, tokenizeRequest.parameters);
+                if (!status.ok()) {
+                    return status;
+                }
+                cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string(buffer.GetString()), timestamp);
+                return absl::OkStatus();
+            } else {
+                SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Rerank tokenize input is of an unsupported type");
+                return absl::InvalidArgumentError("Input should be string or array of strings");
+            }
+        }
+
         RerankHandler handler(*payload.parsedJson);
         absl::Status status = handler.parseRequest();
         if (!status.ok()) {
diff --git a/src/test/reranknode_test.cpp b/src/test/reranknode_test.cpp
index 89cdba5422..e3f8dea020 100644
--- a/src/test/reranknode_test.cpp
+++ b/src/test/reranknode_test.cpp
@@ -383,3 +383,242 @@ INSTANTIATE_TEST_SUITE_P(
     RerankWithInvalidParamsHttpTestInstances,
     RerankWithInvalidParamsHttpTest,
     graphs);
+
+class RerankTokenizeHttpTest : public V3HttpTest {
+protected:
+    static std::unique_ptr<std::thread> t;
+
+public:
+    const std::string endpointTokenize = "/v3/tokenize";
+    static void SetUpTestSuite() {
+        std::string port = "9173";
+        std::string configPath = getGenericFullPathForSrcTest("/ovms/src/test/rerank/config.json");
+        SetUpSuite(port, configPath, t);
+    }
+
+    static void TearDownTestSuite() {
+        TearDownSuite(t);
+    }
+
+    static void AssertTokenizationResult(const std::string& response, const std::vector<int>& expectedTokens) {
+        rapidjson::Document d;
+        rapidjson::ParseResult ok = d.Parse(response.c_str());
+        ASSERT_EQ(ok.Code(), 0);
+        ASSERT_TRUE(d.HasMember("tokens"));
+        ASSERT_TRUE(d["tokens"].IsArray());
+        ASSERT_EQ(d["tokens"].Size(), expectedTokens.size());
+        for (size_t i = 0; i < expectedTokens.size(); ++i) {
+            ASSERT_EQ(d["tokens"][(rapidjson::SizeType)i].GetInt(), expectedTokens[i]);
+        }
+    }
+
+    static void AssertTokenizationResult(const std::string& response, const std::vector<std::vector<int>>& expectedTokensBatch) {
+        rapidjson::Document d;
+        rapidjson::ParseResult ok = d.Parse(response.c_str());
+        ASSERT_EQ(ok.Code(), 0);
+        ASSERT_TRUE(d.HasMember("tokens"));
+        ASSERT_TRUE(d["tokens"].IsArray());
+        ASSERT_EQ(d["tokens"].Size(), expectedTokensBatch.size());
+        for (size_t i = 0; i < expectedTokensBatch.size(); ++i) {
+            const auto& expectedTokens = expectedTokensBatch[i];
+            ASSERT_TRUE(d["tokens"][(rapidjson::SizeType)i].IsArray());
+            ASSERT_EQ(d["tokens"][(rapidjson::SizeType)i].Size(), expectedTokens.size());
+            for (size_t j = 0; j < expectedTokens.size(); ++j) {
+                ASSERT_EQ(d["tokens"][(rapidjson::SizeType)i][(rapidjson::SizeType)j].GetInt(), expectedTokens[j]);
+            }
+        }
+    }
+};
+
+std::unique_ptr<std::thread> RerankTokenizeHttpTest::t;
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositive) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world"
+        }
+    )";
+    std::vector<int> expectedTokens = {33600, 31, 8999};
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizeNegativeMissingText) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov"
+        }
+    )";
+    Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser);
+    ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR) << status.string();
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizeNegativeInvalidModel) {
+    std::string requestBody = R"(
+        {
+            "model": "non_existing_model",
+            "text": "hello world"
+        }
+    )";
+    Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser);
+    ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_DEFINITION_NAME_MISSING) << status.string();
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLenParam) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world hello world",
+            "max_length": 3
+        }
+    )";
+    std::vector<int> expectedTokens = {33600, 31, 8999};
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositivePadToMaxLenParam) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world",
+            "max_length": 100,
+            "pad_to_max_length": true
+        }
+    )";
+    std::vector<int> expectedTokens(97, 1);
+    expectedTokens.insert(expectedTokens.begin(), {33600, 31, 8999});
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideLeft) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world",
+            "max_length": 100,
+            "pad_to_max_length": true,
+            "padding_side": "left"
+        }
+    )";
+    std::vector<int> expectedTokens(97, 1);
+    expectedTokens.insert(expectedTokens.end(), {33600, 31, 8999});
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositivePaddingSideRight) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world",
+            "max_length": 100,
+            "pad_to_max_length": true,
+            "padding_side": "right"
+        }
+    )";
+    std::vector<int> expectedTokens(97, 1);
+    expectedTokens.insert(expectedTokens.begin(), {33600, 31, 8999});
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizeNegativeInvalidPaddingSide) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world",
+            "padding_side": "invalid_value"
+        }
+    )";
+    Status status = handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser);
+    ASSERT_EQ(status, ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR) << status.string();
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositiveMaxLengthIgnored) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world",
+            "max_length": 513,
+            "pad_to_max_length": true
+        }
+    )";
+    std::vector<int> expectedTokens(510, 1);
+    expectedTokens.insert(expectedTokens.begin(), {33600, 31, 8999});
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizePositiveBatch) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": ["hello", "hello world", "hello hello hello world"]
+        }
+    )";
+    std::vector<std::vector<int>> expectedTokens = {
+        {33600, 31},
+        {33600, 31, 8999},
+        {33600, 31, 33600, 31, 33600, 31, 8999}};
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizeBatchWithPadToMaxLen) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": ["hello", "hello world", "hello hello hello world"],
+            "max_length": 6,
+            "pad_to_max_length": true
+        }
+    )";
+    std::vector<std::vector<int>> expectedTokens = {
+        {33600, 31, 1, 1, 1, 1},
+        {33600, 31, 8999, 1, 1, 1},
+        {33600, 31, 33600, 31, 33600, 31}};
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}
+
+TEST_F(RerankTokenizeHttpTest, tokenizeIgnoreAddSpecialTokensParameter) {
+    std::string requestBody = R"(
+        {
+            "model": "rerank_ov",
+            "text": "hello world",
+            "max_length": 3,
+            "add_special_tokens": true
+        }
+    )";
+    std::vector<int> expectedTokens = {33600, 31, 8999};
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointTokenize, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    AssertTokenizationResult(response, expectedTokens);
+}