Commit 6f43af4

fix: vllm rerank endpoint and upgrade the documentation (#609)

Fixes #608
Signed-off-by: ffais <[email protected]>

1 parent e2775c4

File tree

4 files changed, +4 -4 lines changed

Makefile

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 IMG ?= controller:latest
 # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
 ENVTEST_K8S_VERSION = 1.33.0
-CRD_REF_DOCS_VERSION = v0.1.0
+CRD_REF_DOCS_VERSION = v0.2.0
 SKAFFOLD_VERSION = v2.13.2
 
 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)

docs/how-to/configure-reranking-models.md

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ Once the pod is ready, you can call the rerank endpoint:
 ```python
 import requests
 resp = requests.post(
-    "http://localhost:8000/vllm/v1/rerank",
+    "http://localhost:8000/openai/v1/rerank",
     json={
         "model": "bge-rerank-base-cpu",
         "query": "Which document talks about apples?",
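The hunk above ends mid-snippet, so only the `model` and `query` fields of the request body are visible. The sketch below assembles the same request against the corrected `/openai/v1/rerank` path; the `documents` field and its contents are hypothetical additions for illustration, as they fall outside the diff hunk.

```python
import json

# Endpoint corrected by this commit: rerank now lives under the /openai
# prefix, alongside the other OpenAI-compatible routes.
RERANK_URL = "http://localhost:8000/openai/v1/rerank"


def build_rerank_request(model, query, documents):
    """Assemble the JSON body for the rerank endpoint.

    The "documents" field is a hypothetical addition for illustration;
    the diff hunk ends after the "query" field.
    """
    return {"model": model, "query": query, "documents": documents}


payload = build_rerank_request(
    "bge-rerank-base-cpu",
    "Which document talks about apples?",
    ["Apples are a popular fruit.", "Paris is the capital of France."],
)
# An actual call would be: requests.post(RERANK_URL, json=payload)
print(json.dumps(payload))
```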

docs/reference/kubernetes-api.md

Lines changed: 1 addition & 1 deletion

@@ -128,7 +128,7 @@ _Appears in:_
 
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `url` _string_ | URL of the model to be served.<br />Currently the following formats are supported:<br /><br />For VLLM, FasterWhisper, Infinity engines:<br /><br />"hf://<repo>/<model>"<br />"pvc://<pvcName>"<br />"pvc://<pvcName>/<pvcSubpath>"<br />"gs://<bucket>/<path>" (only with cacheProfile)<br />"oss://<bucket>/<path>" (only with cacheProfile)<br />"s3://<bucket>/<path>" (only with cacheProfile)<br /><br />For OLlama engine:<br /><br />"ollama://<model>" | | Required: \{\} <br /> |
+| `url` _string_ | URL of the model to be served.<br />Currently the following formats are supported:<br />For VLLM, FasterWhisper, Infinity engines:<br />"hf://<repo>/<model>"<br />"pvc://<pvcName>"<br />"pvc://<pvcName>/<pvcSubpath>"<br />"gs://<bucket>/<path>" (only with cacheProfile)<br />"oss://<bucket>/<path>" (only with cacheProfile)<br />"s3://<bucket>/<path>" (only with cacheProfile)<br />For OLlama engine:<br />"ollama://<model>" | | Required: \{\} <br /> |
 | `adapters` _[Adapter](#adapter) array_ | | | |
 | `features` _[ModelFeature](#modelfeature) array_ | Features that the model supports.<br />Dictates the APIs that are available for the model. | | Enum: [TextGeneration TextEmbedding Reranking SpeechToText] <br /> |
 | `engine` _string_ | Engine to be used for the server process. | | Enum: [OLlama VLLM FasterWhisper Infinity] <br />Required: \{\} <br /> |

internal/openaiserver/handler.go

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ func NewHandler(k8sClient client.Client, modelProxy *modelproxy.Handler) *Handle
 	handle("/openai/v1/chat/completions", http.StripPrefix("/openai", modelProxy))
 	handle("/openai/v1/completions", http.StripPrefix("/openai", modelProxy))
 	handle("/openai/v1/embeddings", http.StripPrefix("/openai", modelProxy))
-	handle("/vllm/v1/rerank", http.StripPrefix("/vllm", modelProxy))
+	handle("/openai/v1/rerank", http.StripPrefix("/openai", modelProxy))
 	handle("/openai/v1/audio/transcriptions", http.StripPrefix("/openai", modelProxy))
 	handle("/openai/v1/models", http.HandlerFunc(h.getModels))
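Because each route is wrapped in `http.StripPrefix`, the model proxy receives the same upstream path (`/v1/rerank`) both before and after this change; only the externally visible prefix moves from `/vllm` to `/openai`, making the route consistent with the other OpenAI-compatible endpoints. A minimal Python sketch of that prefix-stripping behavior (the function name here is illustrative, not taken from the Go code):

```python
def strip_prefix(prefix: str, path: str) -> str:
    """Mimic Go's http.StripPrefix: drop the mount prefix from the
    request path before it reaches the wrapped handler."""
    return path[len(prefix):] if path.startswith(prefix) else path


# Both the old and the new route deliver the same path to the proxy;
# only the externally visible prefix changes.
old = strip_prefix("/vllm", "/vllm/v1/rerank")
new = strip_prefix("/openai", "/openai/v1/rerank")
print(old, new)  # both are "/v1/rerank"
```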
4444
