diff --git a/docs/docs.json b/docs/docs.json
index 2db3ae6..741aba5 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -209,6 +209,7 @@
"integrations/embedding/huggingface",
"integrations/embedding/aws",
"integrations/embedding/cohere",
+ "integrations/embedding/colpali",
"integrations/embedding/gemini",
"integrations/embedding/ibm",
"integrations/embedding/imagebind",
@@ -230,6 +231,7 @@
"integrations/reranking/cross_encoder",
"integrations/reranking/jina",
"integrations/reranking/linear_combination",
+ "integrations/reranking/mrr",
"integrations/reranking/openai",
"integrations/reranking/rrf",
"integrations/reranking/voyageai"
diff --git a/docs/integrations/embedding/colpali.mdx b/docs/integrations/embedding/colpali.mdx
new file mode 100644
index 0000000..dcd65a0
--- /dev/null
+++ b/docs/integrations/embedding/colpali.mdx
@@ -0,0 +1,53 @@
+---
+title: ColPali
+sidebarTitle: ColPali
+---
+
+import {
+ PyEmbeddingColpaliSetup,
+ PyEmbeddingColpaliTextSearch,
+} from '/snippets/integrations.mdx';
+
+We support [ColPali](https://github.com/illuin-tech/colpali) model embeddings for multimodal, multi-vector retrieval. ColPali produces multiple embedding vectors per input, enabling more nuanced similarity matching between text queries and image documents.
+
+Using ColPali requires the `colpali-engine` package, which can be installed with `pip install colpali-engine`.
+
+
+ColPali produces **multi-vector** embeddings, meaning each input generates multiple embedding vectors rather than a single vector. Use `MultiVector(func.ndims())` instead of `Vector(func.ndims())` when defining your schema.
+
+
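+For example, here is a minimal schema sketch using the registered `colpali` function (a condensed form of the full snippet below):
+
+```python
+from lancedb.embeddings import get_registry
+from lancedb.pydantic import LanceModel, MultiVector
+
+func = get_registry().get("colpali").create()
+
+class Images(LanceModel):
+    image_uri: str = func.SourceField()
+    # Each row stores a list of vectors (one per token/patch), not a single vector
+    vector: MultiVector(func.ndims()) = func.VectorField()
+```
+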
+Supported models are:
+
+- `Metric-AI/ColQwen2.5-3b-multilingual-v1.0` (default)
+- `vidore/colpali-v1.3`
+- `vidore/colqwen2-v1.0`
+- `vidore/colSmol-256M`
+
+Supported parameters (passed to the `create` method) are listed below, followed by a short usage sketch:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `model_name` | `str` | `"Metric-AI/ColQwen2.5-3b-multilingual-v1.0"` | The name of the model to use. |
+| `device` | `str` | `"auto"` | The device for inference. Can be `"auto"`, `"cpu"`, `"cuda"`, or `"mps"`. |
+| `dtype` | `str` | `"bfloat16"` | Data type for model weights (`bfloat16`, `float16`, `float32`, or `float64`). |
+| `pooling_strategy` | `str` | `"hierarchical"` | Token pooling strategy: `"hierarchical"`, `"lambda"`, or `None`. |
+| `pool_factor` | `int` | `2` | Factor to reduce sequence length when pooling is enabled. |
+| `batch_size` | `int` | `2` | Batch size for processing inputs. |
+| `quantization_config` | `Optional[BitsAndBytesConfig]` | `None` | Quantization configuration for the model (requires bitsandbytes). |
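+
+For example, a sketch that overrides a few of the defaults (the specific values here are illustrative):
+
+```python
+from lancedb.embeddings import get_registry
+
+func = get_registry().get("colpali").create(
+    model_name="vidore/colpali-v1.3",  # any supported model from the list above
+    device="cuda",                     # "auto", "cpu", "cuda", or "mps"
+    pooling_strategy="hierarchical",   # "hierarchical", "lambda", or None
+    pool_factor=2,                     # sequence-length reduction when pooling
+    batch_size=4,
+)
+```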
+
+This embedding function supports ingesting images as both bytes and URLs. You can query them using text.
+
+
+
+ {PyEmbeddingColpaliSetup}
+
+
+
+Now we can search using text queries:
+
+
+
+ {PyEmbeddingColpaliTextSearch}
+
+
+
diff --git a/docs/integrations/reranking/answerdotai.mdx b/docs/integrations/reranking/answerdotai.mdx
index 6253466..7e6e982 100644
--- a/docs/integrations/reranking/answerdotai.mdx
+++ b/docs/integrations/reranking/answerdotai.mdx
@@ -27,7 +27,7 @@ Accepted Arguments
| `model_type` | `str` | `"colbert"` | The type of model to use. Supported model types can be found here: https://github.com/AnswerDotAI/rerankers. |
| `model_name` | `str` | `"answerdotai/answerai-colbert-small-v1"` | The name of the reranker model to use. |
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
diff --git a/docs/integrations/reranking/cohere.mdx b/docs/integrations/reranking/cohere.mdx
index 0170c18..d0945f7 100644
--- a/docs/integrations/reranking/cohere.mdx
+++ b/docs/integrations/reranking/cohere.mdx
@@ -32,7 +32,7 @@ Accepted Arguments
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
| `top_n` | `str` | `None` | The number of results to return. If None, will return all results. |
| `api_key` | `str` | `None` | The API key for the Cohere API. If not provided, the `COHERE_API_KEY` environment variable is used. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
diff --git a/docs/integrations/reranking/colbert.mdx b/docs/integrations/reranking/colbert.mdx
index 8b4ba05..c8533b8 100644
--- a/docs/integrations/reranking/colbert.mdx
+++ b/docs/integrations/reranking/colbert.mdx
@@ -26,7 +26,7 @@ Accepted Arguments
| `model_name` | `str` | `"colbert-ir/colbertv2.0"` | The name of the reranker model to use.|
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
| `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
## Supported Scores for each query type
diff --git a/docs/integrations/reranking/cross_encoder.mdx b/docs/integrations/reranking/cross_encoder.mdx
index 61a271a..adeefaa 100644
--- a/docs/integrations/reranking/cross_encoder.mdx
+++ b/docs/integrations/reranking/cross_encoder.mdx
@@ -26,7 +26,7 @@ Accepted Arguments
| `model_name` | `str` | `""cross-encoder/ms-marco-TinyBERT-L-6"` | The name of the reranker model to use.|
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
| `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
## Supported Scores for each query type
You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
diff --git a/docs/integrations/reranking/jina.mdx b/docs/integrations/reranking/jina.mdx
index cf5e902..f2eb83d 100644
--- a/docs/integrations/reranking/jina.mdx
+++ b/docs/integrations/reranking/jina.mdx
@@ -29,7 +29,7 @@ Accepted Arguments
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
| `top_n` | `str` | `None` | The number of results to return. If None, will return all results. |
| `api_key` | `str` | `None` | The API key for the Jina API. If not provided, the `JINA_API_KEY` environment variable is used. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
diff --git a/docs/integrations/reranking/linear_combination.mdx b/docs/integrations/reranking/linear_combination.mdx
index 0587561..c521682 100644
--- a/docs/integrations/reranking/linear_combination.mdx
+++ b/docs/integrations/reranking/linear_combination.mdx
@@ -27,7 +27,7 @@ Accepted Arguments
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `weight` | `float` | `0.7` | The weight to use for the semantic search score. The weight for the full-text search score is `1 - weights`. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all", will return all scores from the vector and FTS search along with the relevance score. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
## Supported Scores for each query type
diff --git a/docs/integrations/reranking/mrr.mdx b/docs/integrations/reranking/mrr.mdx
new file mode 100644
index 0000000..2747ef9
--- /dev/null
+++ b/docs/integrations/reranking/mrr.mdx
@@ -0,0 +1,46 @@
+---
+title: MRR Reranker
+sidebarTitle: "MRR Algorithm"
+description: Combine and rerank search results using the Mean Reciprocal Rank (MRR) algorithm in LanceDB. Supports weighted scoring for hybrid and multivector search.
+---
+
+import {
+  PyRerankingMrrMultivector,
+  PyRerankingMrrUsage,
+} from '/snippets/integrations.mdx';
+
+# MRR Reranker
+
+This reranker uses the Mean Reciprocal Rank (MRR) algorithm to combine and rerank search results from vector and full-text search. You can use this reranker by passing `MRRReranker()` to the `rerank()` method. The MRR algorithm calculates the average of reciprocal ranks across different search results, providing a balanced way to merge results from multiple ranking systems.
+
+> **Note:** Supported query types – Hybrid and Multivector search.
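+
+To illustrate the fusion step, here is a simplified sketch of weighted reciprocal-rank scoring (an illustration of the idea, not the library's internal implementation): each result's combined score is a weighted sum of the reciprocals of its ranks in the individual result lists.
+
+```python
+# Simplified sketch of weighted reciprocal-rank fusion (illustrative only).
+def mrr_fuse(rankings: list[list[str]], weights: list[float]) -> dict[str, float]:
+    scores: dict[str, float] = {}
+    for ranking, weight in zip(rankings, weights):
+        for rank, doc_id in enumerate(ranking, start=1):
+            scores[doc_id] = scores.get(doc_id, 0.0) + weight / rank
+    return scores
+
+# With weight_vector=0.7 and weight_fts=0.3:
+#   "a" is 1st in the vector list and 2nd in FTS: 0.7/1 + 0.3/2 = 0.85
+#   "b" is 2nd in the vector list and 1st in FTS: 0.7/2 + 0.3/1 = 0.65
+print(mrr_fuse([["a", "b"], ["b", "a"]], [0.7, 0.3]))
+```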
+
+
+
+ {PyRerankingMrrUsage}
+
+
+
+Accepted Arguments
+----------------
+| Argument | Type | Default | Description |
+| --- | --- | --- | --- |
+| `weight_vector` | `float` | `0.5` | Weight for vector search results (0.0 to 1.0). |
+| `weight_fts` | `float` | `0.5` | Weight for FTS search results (0.0 to 1.0). |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
+
+**Note:** `weight_vector` + `weight_fts` must equal 1.0.
+
+
+## Supported Scores for each query type
+You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
+
+### Hybrid Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. |
+| `all` | ✅ Supported | Results have the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`). |
+
+### Multivector Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. |
+| `all` | ✅ Supported | Results have vector distances from all searches along with `_relevance_score`. |
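+
+For multivector search, run the query against each vector column, collect the results with row IDs, and pass them to `rerank_multivector`:
+
+ {PyRerankingMrrMultivector}
+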
diff --git a/docs/integrations/reranking/openai.mdx b/docs/integrations/reranking/openai.mdx
index b2d4db8..a68f1b9 100644
--- a/docs/integrations/reranking/openai.mdx
+++ b/docs/integrations/reranking/openai.mdx
@@ -26,8 +26,8 @@ Accepted Arguments
| --- | --- | --- | --- |
| `model_name` | `str` | `"gpt-4-turbo-preview"` | The name of the reranker model to use.|
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
-| `api_key` | str | `None` | The API key to use. If None, will use the OPENAI_API_KEY environment variable.
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
+| `api_key` | `str` | `None` | The API key to use. If None, will use the `OPENAI_API_KEY` environment variable. |
## Supported Scores for each query type
diff --git a/docs/integrations/reranking/rrf.mdx b/docs/integrations/reranking/rrf.mdx
index 0f8edbd..943eb78 100644
--- a/docs/integrations/reranking/rrf.mdx
+++ b/docs/integrations/reranking/rrf.mdx
@@ -26,7 +26,7 @@ Accepted Arguments
| Argument | Type | Default | Description |
| --- | --- | --- | --- |
| `K` | `int` | `60` | A constant used in the RRF formula (default is 60). Experiments indicate that k = 60 was near-optimal, but that the choice is not critical. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
## Supported Scores for each query type
diff --git a/docs/integrations/reranking/voyageai.mdx b/docs/integrations/reranking/voyageai.mdx
index 3a3c028..704d2c5 100644
--- a/docs/integrations/reranking/voyageai.mdx
+++ b/docs/integrations/reranking/voyageai.mdx
@@ -31,7 +31,7 @@ Accepted Arguments
| `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
| `top_n` | `str` | `None` | The number of results to return. If None, will return all results. |
| `api_key` | `str` | `None` | The API key for the Voyage AI API. If not provided, the `VOYAGE_API_KEY` environment variable is used. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on query type. |
| `truncation` | `bool` | `None` | Whether to truncate the input to satisfy the "context length limit" on the query and the documents. |
diff --git a/docs/snippets/integrations.mdx b/docs/snippets/integrations.mdx
index 6b311a9..d30c979 100644
--- a/docs/snippets/integrations.mdx
+++ b/docs/snippets/integrations.mdx
@@ -1,9 +1,15 @@
{/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}
+export const PyRerankingMrrMultivector = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n meta: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n meta_vector: Vector(embedder.ndims()) = embedder.VectorField(source_column=\"meta\")\n\ndata = [\n {\"text\": \"hello world\", \"meta\": \"greeting message\"},\n {\"text\": \"goodbye world\", \"meta\": \"farewell message\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\n\n# Search across multiple vector columns and collect results with row IDs\nquery = \"hello\"\nrs1 = tbl.search(query, vector_column_name=\"vector\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(query, vector_column_name=\"meta_vector\").limit(10).with_row_id(True).to_arrow()\n\n# Rerank the combined results using MRR\nreranker = MRRReranker()\ncombined_results = reranker.rerank_multivector([rs1, rs2])\n";
+
export const PyEmbeddingAwsUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\nmodel = get_registry().get(\"bedrock-text\").create()\n\nclass TextModel(LanceModel):\n text: str = model.SourceField()\n vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hello world\", \"goodbye world\"]})\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"bedrock-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\n\ntbl.add(df)\nrs = tbl.search(\"hello\").limit(1).to_pandas()\nprint(rs.head())\n";
export const PyEmbeddingCohereUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nfrom lancedb.embeddings import EmbeddingFunctionRegistry\nfrom lancedb.pydantic import LanceModel, Vector\n\ncohere = (\n EmbeddingFunctionRegistry.get_instance()\n .get(\"cohere\")\n .create(name=\"embed-multilingual-v2.0\")\n)\n\nclass TextModel(LanceModel):\n text: str = cohere.SourceField()\n vector: Vector(cohere.ndims()) = cohere.VectorField()\n\ndata = [{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}]\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"cohere-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\ntbl.add(data)\n";
+export const PyEmbeddingColpaliSetup = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nimport requests\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, MultiVector\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"colpali-demo\"))\nfunc = get_registry().get(\"colpali\").create()\n\nclass Images(LanceModel):\n label: str\n image_uri: str = func.SourceField()\n image_bytes: bytes = func.SourceField()\n vector: MultiVector(func.ndims()) = func.VectorField()\n vec_from_bytes: MultiVector(func.ndims()) = func.VectorField()\n\ntable = db.create_table(\"images\", schema=Images)\nlabels = [\"cat\", \"dog\", \"horse\"]\nuris = [\n \"http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg\",\n \"http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg\",\n \"http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg\",\n]\nimage_bytes = [requests.get(uri).content for uri in uris]\ntable.add(\n pd.DataFrame({\"label\": labels, \"image_uri\": uris, \"image_bytes\": image_bytes})\n)\n";
+
+export const PyEmbeddingColpaliTextSearch = "actual = (\n table.search(\"a furry pet\", vector_column_name=\"vector\")\n .limit(1)\n .to_pydantic(Images)[0]\n)\nprint(actual.label)\n";
+
export const PyEmbeddingGeminiUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\nmodel = get_registry().get(\"gemini-text\").create()\n\nclass TextModel(LanceModel):\n text: str = model.SourceField()\n vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hello world\", \"goodbye world\"]})\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"gemini-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\n\ntbl.add(df)\nrs = tbl.search(\"hello\").limit(1).to_pandas()\nprint(rs.head())\n";
export const PyEmbeddingHuggingfaceUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"huggingface-demo\"))\nmodel = get_registry().get(\"huggingface\").create(name=\"facebook/bart-base\")\n\nclass Words(LanceModel):\n text: str = model.SourceField()\n vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hi hello sayonara\", \"goodbye world\"]})\ntable = db.create_table(\"greets\", schema=Words)\ntable.add(df)\nquery = \"old greeting\"\nactual = table.search(query).limit(1).to_pydantic(Words)[0]\nprint(actual.text)\n";
@@ -156,6 +162,8 @@ export const PyRerankingJinaUsage = "import os\n\nimport lancedb\nfrom lancedb.e
export const PyRerankingLinearCombinationUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import LinearCombinationReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = LinearCombinationReranker()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
+export const PyRerankingMrrUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = MRRReranker(weight_vector=0.7, weight_fts=0.3)\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n\n# Run multivector search across multiple vector columns\nrs1 = tbl.search(\"hello\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(\"greeting\").limit(10).with_row_id(True).to_arrow()\ncombined = MRRReranker().rerank_multivector([rs1, rs2])\n";
+
export const PyRerankingOpenaiUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import OpenaiReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = OpenaiReranker()\n\n# Run vector search with a reranker\nresult = tbl.search(\"hello\").rerank(reranker=reranker).to_list()\n\n# Run FTS search with a reranker\nresult = tbl.search(\"hello\", query_type=\"fts\").rerank(reranker=reranker).to_list()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
export const PyRerankingRrfUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import RRFReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n text: str = embedder.SourceField()\n vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n {\"text\": \"hello world\"},\n {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = RRFReranker()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
diff --git a/tests/py/test_integrations.py b/tests/py/test_integrations.py
index fc78707..43f408e 100644
--- a/tests/py/test_integrations.py
+++ b/tests/py/test_integrations.py
@@ -262,6 +262,53 @@ class ImageBindModel(LanceModel):
# --8<-- [end:embedding_imagebind_text_search]
+def test_embedding_colpali_examples() -> None:
+ require_flag("RUN_COLPALI_SNIPPETS")
+ pytest.importorskip("colpali_engine")
+
+ # --8<-- [start:embedding_colpali_setup]
+ import tempfile
+ from pathlib import Path
+
+ import lancedb
+ import pandas as pd
+ import requests
+ from lancedb.embeddings import get_registry
+ from lancedb.pydantic import LanceModel, MultiVector
+
+ db = lancedb.connect(str(Path(tempfile.mkdtemp()) / "colpali-demo"))
+ func = get_registry().get("colpali").create()
+
+ class Images(LanceModel):
+ label: str
+ image_uri: str = func.SourceField()
+ image_bytes: bytes = func.SourceField()
+ vector: MultiVector(func.ndims()) = func.VectorField()
+ vec_from_bytes: MultiVector(func.ndims()) = func.VectorField()
+
+ table = db.create_table("images", schema=Images)
+ labels = ["cat", "dog", "horse"]
+ uris = [
+ "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
+ "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
+ "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
+ ]
+ image_bytes = [requests.get(uri).content for uri in uris]
+ table.add(
+ pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes})
+ )
+ # --8<-- [end:embedding_colpali_setup]
+
+ # --8<-- [start:embedding_colpali_text_search]
+ actual = (
+ table.search("a furry pet", vector_column_name="vector")
+ .limit(1)
+ .to_pydantic(Images)[0]
+ )
+ print(actual.label)
+ # --8<-- [end:embedding_colpali_text_search]
+
+
def test_embedding_instructor_usage() -> None:
require_flag("RUN_INSTRUCTOR_SNIPPETS")
@@ -888,6 +935,43 @@ class Schema(LanceModel):
# --8<-- [end:reranking_rrf_usage]
+def test_reranking_mrr_usage() -> None:
+ require_flag("RUN_RERANKER_SNIPPETS")
+
+ # --8<-- [start:reranking_mrr_usage]
+ import lancedb
+ from lancedb.embeddings import get_registry
+ from lancedb.pydantic import LanceModel, Vector
+ from lancedb.rerankers import MRRReranker
+
+ embedder = get_registry().get("sentence-transformers").create()
+ db = lancedb.connect("~/.lancedb")
+
+ class Schema(LanceModel):
+ text: str = embedder.SourceField()
+ vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+ data = [
+ {"text": "hello world"},
+ {"text": "goodbye world"},
+ ]
+ tbl = db.create_table("test", schema=Schema, mode="overwrite")
+ tbl.add(data)
+ reranker = MRRReranker(weight_vector=0.7, weight_fts=0.3)
+
+ # Run hybrid search with a reranker
+ tbl.create_fts_index("text", replace=True)
+ result = (
+ tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
+ )
+
+ # Run multivector search across multiple vector columns
+ rs1 = tbl.search("hello").limit(10).with_row_id(True).to_arrow()
+ rs2 = tbl.search("greeting").limit(10).with_row_id(True).to_arrow()
+ combined = MRRReranker().rerank_multivector([rs1, rs2])
+ # --8<-- [end:reranking_mrr_usage]
+
+
def test_reranking_voyageai_usage() -> None:
require_flag("RUN_RERANKER_SNIPPETS")
os.environ["VOYAGE_API_KEY"] = require_env("VOYAGE_API_KEY")