37 changes: 34 additions & 3 deletions demos/rerank/README.md
@@ -146,7 +146,7 @@ index 1, relevance_score 0.09138210117816925

:::{dropdown} **Requesting rerank score with a model that requires applying a template to the query and documents**

-tomaarsen/Qwen3-Reranker-0.6B-seq-cls is a copy of the Qwen3-Reranker-0.6B model (the original model is not supported in OVMS), modified into a sequence classification model. It requires applying a template to the input; here is an example client that does so:
+OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov is a copy of the Qwen3-Reranker-0.6B model (the original model is not supported in OVMS), modified into a sequence classification model. It requires applying a template to the input; here is an example client that does so:

```bash
pip3 install requests
@@ -180,7 +180,7 @@ documents = [

response = requests.post("http://127.0.0.1:8000/v3/rerank",
    json={
-        "model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+        "model": "OpenVINO/Qwen3-Reranker-0.6B-seq-cls-fp16-ov",
        "query": query,
        "documents": documents,
    }).json()
@@ -191,7 +191,7 @@ python rerank_client.py
```
It will return a response similar to:
```
-{'results': [{'index': 0, 'relevance_score': 0.024518223479390144}, {'index': 1, 'relevance_score': 0.0026006349362432957}]}
+{'results': [{'index': 0, 'relevance_score': 0.0216273982077837}, {'index': 1, 'relevance_score': 0.018804751336574554}]}
```
:::

@@ -260,4 +260,35 @@ tomaarsen/Qwen3-Reranker-0.6B-seq-cls

Check [RAG demo](../continuous_batching/rag/README.md) which employs `rerank` endpoint together with `chat/completions` and `embeddings`.

# Usage of tokenize endpoint (release 2026.0 or weekly builds)

The `tokenize` endpoint provides a simple API for tokenizing input text using the same tokenizer as the deployed rerank model. This allows you to see how your text will be split into tokens before feature extraction or inference. The endpoint accepts a string or list of strings and returns the corresponding token IDs.

Example usage:
```console
curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"text\": \"hello world\" }"
```
Response:
```json
{
  "tokens": [33600, 31, 8999]
}
```
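
The same request can be issued from Python. A minimal sketch that builds the request body with the standard library only (the model name and port are copied from the curl example above; the actual POST is left as a comment since it needs a running server):

```python
import json

# Body for the tokenize endpoint; per the description above, "text" may
# also be a list of strings to tokenize several inputs in one call.
payload = {
    "model": "OpenVINO/bge-reranker-base-int8-ov",
    "text": "hello world",
}
body = json.dumps(payload)
print(body)

# Send with e.g. the requests package:
#   import requests
#   tokens = requests.post("http://localhost:8000/v3/tokenize",
#                          headers={"Content-Type": "application/json"},
#                          data=body).json()["tokens"]
```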

It's possible to use additional parameters:
- `pad_to_max_length` - whether to pad the sequence to the maximum length. Default: `false`.
- `max_length` - maximum length of the sequence. If specified, the token sequence is truncated to this length.
- `padding_side` - side on which the sequence is padded, either `left` or `right`. Default: `right`.

Example usage:
```console
curl http://localhost:8000/v3/tokenize -H "Content-Type: application/json" -d "{ \"model\": \"OpenVINO/bge-reranker-base-int8-ov\", \"text\": \"hello world\", \"max_length\": 10, \"pad_to_max_length\": true, \"padding_side\": \"left\"}"
```

Response:
```json
{
  "tokens": [1, 1, 1, 1, 1, 1, 1, 33600, 31, 8999]
}
```
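
The effect of the padding parameters can be reproduced in plain Python. A sketch of the semantics (the pad token id `1` is taken from the sample response above and is model-specific, not a fixed constant):

```python
def pad_tokens(tokens, max_length, pad_id=1, side="right"):
    """Pad a token-id list to max_length, mimicking pad_to_max_length."""
    pad = [pad_id] * max(0, max_length - len(tokens))
    return pad + tokens if side == "left" else tokens + pad

# Matches the response above for "hello world" with max_length=10,
# pad_to_max_length=true and padding_side="left":
print(pad_tokens([33600, 31, 8999], 10, side="left"))
# [1, 1, 1, 1, 1, 1, 1, 33600, 31, 8999]
```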

2 changes: 1 addition & 1 deletion demos/rerank/compare_results.py
@@ -25,7 +25,7 @@
parser = argparse.ArgumentParser(description='Compare rerank responses from HF transformers and OVMS')
parser.add_argument('--base_url', required=False, default='http://localhost:8000/v3/',
                    help='Specify url to rerank endpoint. default: http://localhost:8000/v3/', dest='base_url')
-parser.add_argument('--model_name', default='BAAI/bge-reranker-large', help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5',
+parser.add_argument('--model_name', default='BAAI/bge-reranker-large', help='Model name to query. default: BAAI/bge-reranker-large',
                    dest='model_name')
parser.add_argument('--query', default='', help='Query string to rerank.',
                    dest='query')
22 changes: 0 additions & 22 deletions demos/rerank/config.json

This file was deleted.

29 changes: 0 additions & 29 deletions demos/rerank/models/graph.pbtxt

This file was deleted.

1 change: 1 addition & 0 deletions src/rerank/BUILD
@@ -80,6 +80,7 @@ ovms_cc_library(
        "//src:model_metric_reporter",
        "//src:executingstreamidguard",
        "//src:libovms_execution_context",
        "//src/tokenize:tokenize_parser",
    ],
    visibility = ["//visibility:public"],
    alwayslink = 1,
21 changes: 21 additions & 0 deletions src/rerank/rerank_calculator_ov.cc
@@ -42,6 +42,7 @@
#include "rerank_servable.hpp"
#include "../model_metric_reporter.hpp"
#include "../executingstreamidguard.hpp"
#include "../tokenize/tokenize_parser.hpp"

using namespace rapidjson;
using namespace ovms;
@@ -289,6 +290,26 @@ class RerankCalculatorOV : public CalculatorBase {
    InputDataType payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get<InputDataType>();
    SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request body: {}", payload.body);
    SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request uri: {}", payload.uri);

    if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
        TokenizeRequest tokenizeRequest;
        absl::Status status = TokenizeParser::parseTokenizeRequest(*payload.parsedJson, tokenizeRequest);
        if (!status.ok()) {
            return status;
        }
        tokenizeRequest.parameters["add_special_tokens"] = false;  // Rerank model tokenizer should not add special tokens
        if (auto strings = std::get_if<std::vector<std::string>>(&tokenizeRequest.input)) {
            auto tokens = rerank_session->getTokenizer().encode(*strings, tokenizeRequest.parameters);
            StringBuffer buffer;
            status = TokenizeParser::parseTokenizeResponse(buffer, tokens, tokenizeRequest.parameters);
            if (!status.ok()) {
                return status;
            }
            cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string(buffer.GetString()), timestamp);
            return absl::OkStatus();
        } else {
            SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Rerank tokenize input is of an unsupported type");
            return absl::InvalidArgumentError("Input should be a string or an array of strings");
        }
    }

    RerankHandler handler(*payload.parsedJson);
    absl::Status status = handler.parseRequest();
    if (!status.ok()) {