diff --git a/docs/docs.json b/docs/docs.json
index 2db3ae6..741aba5 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -209,6 +209,7 @@
             "integrations/embedding/huggingface",
             "integrations/embedding/aws",
             "integrations/embedding/cohere",
+            "integrations/embedding/colpali",
             "integrations/embedding/gemini",
             "integrations/embedding/ibm",
             "integrations/embedding/imagebind",
@@ -230,6 +231,7 @@
             "integrations/reranking/cross_encoder",
             "integrations/reranking/jina",
             "integrations/reranking/linear_combination",
+            "integrations/reranking/mrr",
             "integrations/reranking/openai",
             "integrations/reranking/rrf",
             "integrations/reranking/voyageai"
diff --git a/docs/integrations/embedding/colpali.mdx b/docs/integrations/embedding/colpali.mdx
new file mode 100644
index 0000000..dcd65a0
--- /dev/null
+++ b/docs/integrations/embedding/colpali.mdx
@@ -0,0 +1,53 @@
+---
+title: ColPali
+sidebarTitle: ColPali
+---
+
+import {
+  PyEmbeddingColpaliSetup,
+  PyEmbeddingColpaliTextSearch,
+} from '/snippets/integrations.mdx';
+
+We support [ColPali](https://github.com/illuin-tech/colpali) model embeddings for multimodal, multi-vector retrieval. ColPali produces multiple embedding vectors per input (multi-vector), enabling more nuanced similarity matching between text queries and image documents.
+
+Using ColPali requires the `colpali-engine` package, which can be installed with `pip install colpali-engine`.
+
+
+ColPali produces **multi-vector** embeddings, meaning each input generates multiple embedding vectors rather than a single vector. Use `MultiVector(func.ndims())` instead of `Vector(func.ndims())` when defining your schema.
+
+
+Supported models are:
+
+- Metric-AI/ColQwen2.5-3b-multilingual-v1.0 (default)
+- vidore/colpali-v1.3
+- vidore/colqwen2-v1.0
+- vidore/colSmol-256M
+
+Supported parameters (to be passed to the `create` method) are:
+
+| Parameter | Type | Default Value | Description |
+|---|---|---|---|
+| `model_name` | `str` | `"Metric-AI/ColQwen2.5-3b-multilingual-v1.0"` | The name of the model to use. |
+| `device` | `str` | `"auto"` | The device for inference. Can be `"auto"`, `"cpu"`, `"cuda"`, or `"mps"`. |
+| `dtype` | `str` | `"bfloat16"` | Data type for model weights (`bfloat16`, `float16`, `float32`, or `float64`). |
+| `pooling_strategy` | `str` | `"hierarchical"` | Token pooling strategy: `"hierarchical"`, `"lambda"`, or `None`. |
+| `pool_factor` | `int` | `2` | Factor by which the token sequence length is reduced when pooling is enabled. |
+| `batch_size` | `int` | `2` | Batch size for processing inputs. |
+| `quantization_config` | `Optional[BitsAndBytesConfig]` | `None` | Quantization configuration for the model (requires `bitsandbytes`). |
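+
+For illustration, here is how `create` might look with several of these parameters set explicitly. This is a sketch with arbitrary values; any supported model and parameter combination from the tables above can be substituted:
+
+```python
+from lancedb.embeddings import get_registry
+
+# Illustrative configuration; omitted parameters fall back to the defaults above.
+func = get_registry().get("colpali").create(
+    model_name="vidore/colpali-v1.3",
+    device="cpu",
+    pooling_strategy="hierarchical",
+    pool_factor=4,  # a larger factor means fewer vectors per input
+    batch_size=2,
+)
+```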
+
+This embedding function supports ingesting images as both bytes and URLs. You can query them using text.
+
+
+
+  {PyEmbeddingColpaliSetup}
+
+
+
+Now we can search using text queries:
+
+
+
+  {PyEmbeddingColpaliTextSearch}
+
+
diff --git a/docs/integrations/reranking/answerdotai.mdx b/docs/integrations/reranking/answerdotai.mdx
index 6253466..7e6e982 100644
--- a/docs/integrations/reranking/answerdotai.mdx
+++ b/docs/integrations/reranking/answerdotai.mdx
@@ -27,7 +27,7 @@
 | `model_type` | `str` | `"colbert"` | The type of model to use. Supported model types can be found here: https://github.com/AnswerDotAI/rerankers. |
 | `model_name` | `str` | `"answerdotai/answerai-colbert-small-v1"` | The name of the reranker model to use. |
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
diff --git a/docs/integrations/reranking/cohere.mdx b/docs/integrations/reranking/cohere.mdx
index 0170c18..d0945f7 100644
--- a/docs/integrations/reranking/cohere.mdx
+++ b/docs/integrations/reranking/cohere.mdx
@@ -32,7 +32,7 @@ Accepted Arguments
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
 | `top_n` | `str` | `None` | The number of results to return. If None, will return all results. |
 | `api_key` | `str` | `None` | The API key for the Cohere API. If not provided, the `COHERE_API_KEY` environment variable is used. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
diff --git a/docs/integrations/reranking/colbert.mdx b/docs/integrations/reranking/colbert.mdx
index 8b4ba05..c8533b8 100644
--- a/docs/integrations/reranking/colbert.mdx
+++ b/docs/integrations/reranking/colbert.mdx
@@ -26,7 +26,7 @@ Accepted Arguments
 | `model_name` | `str` | `"colbert-ir/colbertv2.0"` | The name of the reranker model to use.|
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
 | `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
 
 ## Supported Scores for each query type
diff --git a/docs/integrations/reranking/cross_encoder.mdx b/docs/integrations/reranking/cross_encoder.mdx
index 61a271a..adeefaa 100644
--- a/docs/integrations/reranking/cross_encoder.mdx
+++ b/docs/integrations/reranking/cross_encoder.mdx
@@ -26,7 +26,7 @@ Accepted Arguments
 | `model_name` | `str` | `""cross-encoder/ms-marco-TinyBERT-L-6"` | The name of the reranker model to use.|
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
 | `device` | `str` | `None` | The device to use for the cross encoder model. If None, will use "cuda" if available, otherwise "cpu". |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
 
 ## Supported Scores for each query type
 You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
diff --git a/docs/integrations/reranking/jina.mdx b/docs/integrations/reranking/jina.mdx
index cf5e902..f2eb83d 100644
--- a/docs/integrations/reranking/jina.mdx
+++ b/docs/integrations/reranking/jina.mdx
@@ -29,7 +29,7 @@ Accepted Arguments
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
 | `top_n` | `str` | `None` | The number of results to return. If None, will return all results. |
 | `api_key` | `str` | `None` | The API key for the Jina API. If not provided, the `JINA_API_KEY` environment variable is used. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
diff --git a/docs/integrations/reranking/linear_combination.mdx b/docs/integrations/reranking/linear_combination.mdx
index 0587561..c521682 100644
--- a/docs/integrations/reranking/linear_combination.mdx
+++ b/docs/integrations/reranking/linear_combination.mdx
@@ -27,7 +27,7 @@ Accepted Arguments
 | Argument | Type | Default | Description |
 | --- | --- | --- | --- |
 | `weight` | `float` | `0.7` | The weight to use for the semantic search score. The weight for the full-text search score is `1 - weights`. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all", will return all scores from the vector and FTS search along with the relevance score. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
 
 ## Supported Scores for each query type
diff --git a/docs/integrations/reranking/mrr.mdx b/docs/integrations/reranking/mrr.mdx
new file mode 100644
index 0000000..2747ef9
--- /dev/null
+++ b/docs/integrations/reranking/mrr.mdx
@@ -0,0 +1,46 @@
+---
+title: MRR Reranker
+sidebarTitle: "MRR Algorithm"
+description: Combine and rerank search results using the Mean Reciprocal Rank (MRR) algorithm in LanceDB. Supports weighted scoring for hybrid and multivector search.
+ +--- + +import { PyRerankingMrrUsage } from '/snippets/integrations.mdx'; + +# MRR Reranker + +This reranker uses the Mean Reciprocal Rank (MRR) algorithm to combine and rerank search results from vector and full-text search. You can use this reranker by passing `MRRReranker()` to the `rerank()` method. The MRR algorithm calculates the average of reciprocal ranks across different search results, providing a balanced way to merge results from multiple ranking systems. + +> **Note:** Supported query types – Hybrid and Multivector search. + + + + {PyRerankingMrrUsage} + + + +Accepted Arguments +---------------- +| Argument | Type | Default | Description | +| --- | --- | --- | --- | +| `weight_vector` | `float` | `0.5` | Weight for vector search results (0.0 to 1.0). | +| `weight_fts` | `float` | `0.5` | Weight for FTS search results (0.0 to 1.0). | +| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. | + +**Note:** `weight_vector` + `weight_fts` must equal 1.0. + + +## Supported Scores for each query type +You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type: + +### Hybrid Search +|`return_score`| Status | Description | +| --- | --- | --- | +| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | +| `all` | ✅ Supported | Results have vector(`_distance`) and FTS(`score`) along with Hybrid Search score(`_relevance_score`). | + +### Multivector Search +|`return_score`| Status | Description | +| --- | --- | --- | +| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. | +| `all` | ✅ Supported | Results have vector distances from all searches along with `_relevance_score`. | diff --git a/docs/integrations/reranking/openai.mdx b/docs/integrations/reranking/openai.mdx index b2d4db8..a68f1b9 100644 --- a/docs/integrations/reranking/openai.mdx +++ b/docs/integrations/reranking/openai.mdx @@ -26,8 +26,8 @@ Accepted Arguments | --- | --- | --- | --- | | `model_name` | `str` | `"gpt-4-turbo-preview"` | The name of the reranker model to use.| | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. | -| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. | -| `api_key` | str | `None` | The API key to use. If None, will use the OPENAI_API_KEY environment variable. +| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. | +| `api_key` | `str` | `None` | The API key to use. If None, will use the OPENAI_API_KEY environment variable. 
+
+
+## Supported Scores for each query type
+You can specify the type of scores you want the reranker to return. The following are the supported scores for each query type:
+
+### Hybrid Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. |
+| `all` | ✅ Supported | Results have the vector (`_distance`) and FTS (`score`) scores along with the hybrid search score (`_relevance_score`). |
+
+### Multivector Search
+|`return_score`| Status | Description |
+| --- | --- | --- |
+| `relevance` | ✅ Supported | Results only have the `_relevance_score` column. |
+| `all` | ✅ Supported | Results have the vector distances from all searches along with `_relevance_score`. |
diff --git a/docs/integrations/reranking/openai.mdx b/docs/integrations/reranking/openai.mdx
index b2d4db8..a68f1b9 100644
--- a/docs/integrations/reranking/openai.mdx
+++ b/docs/integrations/reranking/openai.mdx
@@ -26,8 +26,8 @@ Accepted Arguments
 | --- | --- | --- | --- |
 | `model_name` | `str` | `"gpt-4-turbo-preview"` | The name of the reranker model to use.|
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type. |
-| `api_key` | str | `None` | The API key to use. If None, will use the OPENAI_API_KEY environment variable.
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
+| `api_key` | `str` | `None` | The API key to use. If None, will use the `OPENAI_API_KEY` environment variable. |
 
 ## Supported Scores for each query type
diff --git a/docs/integrations/reranking/rrf.mdx b/docs/integrations/reranking/rrf.mdx
index 0f8edbd..943eb78 100644
--- a/docs/integrations/reranking/rrf.mdx
+++ b/docs/integrations/reranking/rrf.mdx
@@ -26,7 +26,7 @@ Accepted Arguments
 | Argument | Type | Default | Description |
 | --- | --- | --- | --- |
 | `K` | `int` | `60` | A constant used in the RRF formula (default is 60). Experiments indicate that k = 60 was near-optimal, but that the choice is not critical. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return all scores from the vector and FTS search along with the relevance score. |
 
 ## Supported Scores for each query type
diff --git a/docs/integrations/reranking/voyageai.mdx b/docs/integrations/reranking/voyageai.mdx
index 3a3c028..704d2c5 100644
--- a/docs/integrations/reranking/voyageai.mdx
+++ b/docs/integrations/reranking/voyageai.mdx
@@ -31,7 +31,7 @@ Accepted Arguments
 | `column` | `str` | `"text"` | The name of the column to use as input to the cross encoder model. |
 | `top_n` | `str` | `None` | The number of results to return. If None, will return all results. |
 | `api_key` | `str` | `None` | The API key for the Voyage AI API. If not provided, the `VOYAGE_API_KEY` environment variable is used. |
-| `return_score` | str | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score. If "all" is supported, will return relevance score along with the vector and/or fts scores depending on query type |
+| `return_score` | `str` | `"relevance"` | Options are "relevance" or "all". The type of score to return. If "relevance", will return only the `_relevance_score`. If "all", will return the relevance score along with the vector and/or FTS scores depending on the query type. |
 | `truncation` | `bool` | `None` | Whether to truncate the input to satisfy the "context length limit" on the query and the documents. |
diff --git a/docs/snippets/integrations.mdx b/docs/snippets/integrations.mdx
index 6b311a9..d30c979 100644
--- a/docs/snippets/integrations.mdx
+++ b/docs/snippets/integrations.mdx
@@ -1,9 +1,15 @@
 {/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}
+export const PyRerankingMrrMultivector = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n    text: str = embedder.SourceField()\n    meta: str = embedder.SourceField()\n    vector: Vector(embedder.ndims()) = embedder.VectorField()\n    meta_vector: Vector(embedder.ndims()) = embedder.VectorField(source_column=\"meta\")\n\ndata = [\n    {\"text\": \"hello world\", \"meta\": \"greeting message\"},\n    {\"text\": \"goodbye world\", \"meta\": \"farewell message\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\n\n# Search across multiple vector columns and collect results with row IDs\nquery = \"hello\"\nrs1 = tbl.search(query, vector_column_name=\"vector\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(query, vector_column_name=\"meta_vector\").limit(10).with_row_id(True).to_arrow()\n\n# Rerank the combined results using MRR\nreranker = MRRReranker()\ncombined_results = reranker.rerank_multivector([rs1, rs2])\n";
+
 export const PyEmbeddingAwsUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\nmodel = get_registry().get(\"bedrock-text\").create()\n\nclass TextModel(LanceModel):\n    text: str = model.SourceField()\n    vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hello world\", \"goodbye world\"]})\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"bedrock-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\n\ntbl.add(df)\nrs = tbl.search(\"hello\").limit(1).to_pandas()\nprint(rs.head())\n";
 
 export const PyEmbeddingCohereUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nfrom lancedb.embeddings import EmbeddingFunctionRegistry\nfrom lancedb.pydantic import LanceModel, Vector\n\ncohere = (\n    EmbeddingFunctionRegistry.get_instance()\n    .get(\"cohere\")\n    .create(name=\"embed-multilingual-v2.0\")\n)\n\nclass TextModel(LanceModel):\n    text: str = cohere.SourceField()\n    vector: Vector(cohere.ndims()) = cohere.VectorField()\n\ndata = [{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}]\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"cohere-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\ntbl.add(data)\n";
 
+export const PyEmbeddingColpaliSetup = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nimport requests\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, MultiVector\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"colpali-demo\"))\nfunc = get_registry().get(\"colpali\").create()\n\nclass Images(LanceModel):\n    label: str\n    image_uri: str = func.SourceField()\n    image_bytes: bytes = func.SourceField()\n    vector: MultiVector(func.ndims()) = func.VectorField()\n    vec_from_bytes: MultiVector(func.ndims()) = func.VectorField()\n\ntable = db.create_table(\"images\", schema=Images)\nlabels = [\"cat\", \"dog\", \"horse\"]\nuris = [\n    \"http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg\",\n    \"http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg\",\n    \"http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg\",\n]\nimage_bytes = [requests.get(uri).content for uri in uris]\ntable.add(\n    pd.DataFrame({\"label\": labels, \"image_uri\": uris, \"image_bytes\": image_bytes})\n)\n";
+
+export const PyEmbeddingColpaliTextSearch = "actual = (\n    table.search(\"a furry pet\", vector_column_name=\"vector\")\n    .limit(1)\n    .to_pydantic(Images)[0]\n)\nprint(actual.label)\n";
+
 export const PyEmbeddingGeminiUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\nmodel = get_registry().get(\"gemini-text\").create()\n\nclass TextModel(LanceModel):\n    text: str = model.SourceField()\n    vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hello world\", \"goodbye world\"]})\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"gemini-demo\"))\ntbl = db.create_table(\"test\", schema=TextModel, mode=\"overwrite\")\n\ntbl.add(df)\nrs = tbl.search(\"hello\").limit(1).to_pandas()\nprint(rs.head())\n";
 
 export const PyEmbeddingHuggingfaceUsage = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pandas as pd\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\n\ndb = lancedb.connect(str(Path(tempfile.mkdtemp()) / \"huggingface-demo\"))\nmodel = get_registry().get(\"huggingface\").create(name=\"facebook/bart-base\")\n\nclass Words(LanceModel):\n    text: str = model.SourceField()\n    vector: Vector(model.ndims()) = model.VectorField()\n\ndf = pd.DataFrame({\"text\": [\"hi hello sayonara\", \"goodbye world\"]})\ntable = db.create_table(\"greets\", schema=Words)\ntable.add(df)\nquery = \"old greeting\"\nactual = table.search(query).limit(1).to_pydantic(Words)[0]\nprint(actual.text)\n";
@@ -156,6 +162,8 @@ export const PyRerankingJinaUsage = "import os\n\nimport lancedb\nfrom lancedb.e
 export const PyRerankingLinearCombinationUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import LinearCombinationReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n    text: str = embedder.SourceField()\n    vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n    {\"text\": \"hello world\"},\n    {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = LinearCombinationReranker()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n    tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
 
+export const PyRerankingMrrUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import MRRReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n    text: str = embedder.SourceField()\n    vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n    {\"text\": \"hello world\"},\n    {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = MRRReranker(weight_vector=0.7, weight_fts=0.3)\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n    tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n\n# Run multivector search by combining results from multiple queries\nrs1 = tbl.search(\"hello\").limit(10).with_row_id(True).to_arrow()\nrs2 = tbl.search(\"greeting\").limit(10).with_row_id(True).to_arrow()\ncombined = MRRReranker().rerank_multivector([rs1, rs2])\n";
+
 export const PyRerankingOpenaiUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import OpenaiReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n    text: str = embedder.SourceField()\n    vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n    {\"text\": \"hello world\"},\n    {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = OpenaiReranker()\n\n# Run vector search with a reranker\nresult = tbl.search(\"hello\").rerank(reranker=reranker).to_list()\n\n# Run FTS search with a reranker\nresult = tbl.search(\"hello\", query_type=\"fts\").rerank(reranker=reranker).to_list()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n    tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
 
 export const PyRerankingRrfUsage = "import lancedb\nfrom lancedb.embeddings import get_registry\nfrom lancedb.pydantic import LanceModel, Vector\nfrom lancedb.rerankers import RRFReranker\n\nembedder = get_registry().get(\"sentence-transformers\").create()\ndb = lancedb.connect(\"~/.lancedb\")\n\nclass Schema(LanceModel):\n    text: str = embedder.SourceField()\n    vector: Vector(embedder.ndims()) = embedder.VectorField()\n\ndata = [\n    {\"text\": \"hello world\"},\n    {\"text\": \"goodbye world\"},\n]\ntbl = db.create_table(\"test\", schema=Schema, mode=\"overwrite\")\ntbl.add(data)\nreranker = RRFReranker()\n\n# Run hybrid search with a reranker\ntbl.create_fts_index(\"text\", replace=True)\nresult = (\n    tbl.search(\"hello\", query_type=\"hybrid\").rerank(reranker=reranker).to_list()\n)\n";
diff --git a/tests/py/test_integrations.py b/tests/py/test_integrations.py
index fc78707..43f408e 100644
--- a/tests/py/test_integrations.py
+++ b/tests/py/test_integrations.py
@@ -262,6 +262,53 @@ class ImageBindModel(LanceModel):
     # --8<-- [end:embedding_imagebind_text_search]
 
 
+def test_embedding_colpali_examples() -> None:
+    require_flag("RUN_COLPALI_SNIPPETS")
+    pytest.importorskip("colpali_engine")
+
+    # --8<-- [start:embedding_colpali_setup]
+    import tempfile
+    from pathlib import Path
+
+    import lancedb
+    import pandas as pd
+    import requests
+    from lancedb.embeddings import get_registry
+    from lancedb.pydantic import LanceModel, MultiVector
+
+    db = lancedb.connect(str(Path(tempfile.mkdtemp()) / "colpali-demo"))
+    func = get_registry().get("colpali").create()
+
+    class Images(LanceModel):
+        label: str
+        image_uri: str = func.SourceField()
+        image_bytes: bytes = func.SourceField()
+        vector: MultiVector(func.ndims()) = func.VectorField()
+        vec_from_bytes: MultiVector(func.ndims()) = func.VectorField()
+
+    table = db.create_table("images", schema=Images)
+    labels = ["cat", "dog", "horse"]
+    uris = [
+        "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
+        "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
+        "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
+    ]
+    image_bytes = [requests.get(uri).content for uri in uris]
+    table.add(
+        pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes})
+    )
+    # --8<-- [end:embedding_colpali_setup]
+
+    # --8<-- [start:embedding_colpali_text_search]
+    actual = (
+        table.search("a furry pet", vector_column_name="vector")
+        .limit(1)
+        .to_pydantic(Images)[0]
+    )
+    print(actual.label)
+    # --8<-- [end:embedding_colpali_text_search]
+
+
 def test_embedding_instructor_usage() -> None:
     require_flag("RUN_INSTRUCTOR_SNIPPETS")
@@ -888,6 +935,43 @@ class Schema(LanceModel):
     # --8<-- [end:reranking_rrf_usage]
 
 
+def test_reranking_mrr_usage() -> None:
+    require_flag("RUN_RERANKER_SNIPPETS")
+
+    # --8<-- [start:reranking_mrr_usage]
+    import lancedb
+    from lancedb.embeddings import get_registry
+    from lancedb.pydantic import LanceModel, Vector
+    from lancedb.rerankers import MRRReranker
+
+    embedder = get_registry().get("sentence-transformers").create()
+    db = lancedb.connect("~/.lancedb")
+
+    class Schema(LanceModel):
+        text: str = embedder.SourceField()
+        vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+    data = [
+        {"text": "hello world"},
+        {"text": "goodbye world"},
+    ]
+    tbl = db.create_table("test", schema=Schema, mode="overwrite")
+    tbl.add(data)
+    reranker = MRRReranker(weight_vector=0.7, weight_fts=0.3)
+
+    # Run hybrid search with a reranker
+    tbl.create_fts_index("text", replace=True)
+    result = (
+        tbl.search("hello", query_type="hybrid").rerank(reranker=reranker).to_list()
+    )
+
+    # Run multivector search by combining results from multiple queries
+    rs1 = tbl.search("hello").limit(10).with_row_id(True).to_arrow()
+    rs2 = tbl.search("greeting").limit(10).with_row_id(True).to_arrow()
+    combined = MRRReranker().rerank_multivector([rs1, rs2])
+    # --8<-- [end:reranking_mrr_usage]
+
+
 def test_reranking_voyageai_usage() -> None:
     require_flag("RUN_RERANKER_SNIPPETS")
     os.environ["VOYAGE_API_KEY"] = require_env("VOYAGE_API_KEY")