From f0aa1634043a0ae500fb0a8a59954b31ebc0466f Mon Sep 17 00:00:00 2001 From: uzaxirr Date: Tue, 28 Oct 2025 06:55:57 +0530 Subject: [PATCH 1/6] docs: add vLLM embedder documentation --- _snippets/embedder-vllm-reference.mdx | 12 ++ concepts/knowledge/embedder/vllm.mdx | 139 ++++++++++++++++++ docs.json | 3 + .../knowledge/embedders/vllm-embedder.mdx | 84 +++++++++++ reference/knowledge/embedder/vllm.mdx | 7 + 5 files changed, 245 insertions(+) create mode 100644 _snippets/embedder-vllm-reference.mdx create mode 100644 concepts/knowledge/embedder/vllm.mdx create mode 100644 examples/concepts/knowledge/embedders/vllm-embedder.mdx create mode 100644 reference/knowledge/embedder/vllm.mdx diff --git a/_snippets/embedder-vllm-reference.mdx b/_snippets/embedder-vllm-reference.mdx new file mode 100644 index 00000000..8ad492bb --- /dev/null +++ b/_snippets/embedder-vllm-reference.mdx @@ -0,0 +1,12 @@ +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `id` | `str` | Model identifier (HuggingFace model name) | `"intfloat/e5-mistral-7b-instruct"` | +| `dimensions` | `int` | Embedding vector dimensions | `4096` | +| `base_url` | `Optional[str]` | Remote vLLM server URL (enables remote mode) | `None` | +| `api_key` | `Optional[str]` | API key for remote server authentication | `getenv("VLLM_API_KEY")` | +| `enable_batch` | `bool` | Enable batch processing for multiple texts | `False` | +| `batch_size` | `int` | Number of texts to process per batch | `10` | +| `enforce_eager` | `bool` | Use eager execution mode (local mode) | `True` | +| `vllm_kwargs` | `Optional[Dict[str, Any]]` | Additional vLLM engine parameters (local mode) | `None` | +| `request_params` | `Optional[Dict[str, Any]]` | Additional request parameters (remote mode) | `None` | +| `client_params` | `Optional[Dict[str, Any]]` | OpenAI client configuration (remote mode) | `None` | diff --git a/concepts/knowledge/embedder/vllm.mdx b/concepts/knowledge/embedder/vllm.mdx new file mode 100644 index 00000000..2c7cd943 --- /dev/null +++ b/concepts/knowledge/embedder/vllm.mdx @@ -0,0 +1,139 @@ +--- +title: vLLM Embedder +--- + +vLLM Embedder supports both local and remote embedding model deployment with high-performance inference optimized for throughput and latency. + +## Prerequisites + +vLLM requires Python 3.8+ and GPU support for optimal performance. + +Install vLLM: + +```bash +pip install vllm +``` + +## Deployment Modes + +vLLM Embedder supports two deployment modes: + +### Local Mode + +Direct model loading with the vLLM library. No server required. + +```python +from agno.knowledge.embedder.vllm import VLLMEmbedder + +embedder = VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096 +) + +# Get embeddings +embedding = embedder.get_embedding("Hello world") +``` + +**Use Cases:** +- Development and testing +- Single-machine deployment +- GPU/CPU inference + +**GPU Requirements:** +- e5-mistral-7b-instruct: ~14GB VRAM +- BAAI/bge-large-en-v1.5: ~2GB VRAM +- sentence-transformers/all-MiniLM-L6-v2: ~500MB VRAM + +### Remote Mode + +Connects to a running vLLM server via OpenAI-compatible API. 
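
Before pointing the embedder at a server, one has to be running with the embedding model loaded. A minimal sketch of launching one with the vLLM CLI, assuming a recent vLLM release where `vllm serve` accepts the `--task embed` flag (older releases spell it `--task embedding`):

```bash
# Launch an OpenAI-compatible vLLM server that serves embeddings
# instead of text generation, listening on port 8000.
vllm serve intfloat/e5-mistral-7b-instruct --task embed --port 8000
```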
+ +```python +embedder = VLLMEmbedder( + base_url="http://localhost:8000/v1", + api_key="your-key" # Optional +) +``` + +**Use Cases:** +- Production deployments +- Shared infrastructure +- Horizontal scaling +- Load balancing + +## Recommended Models + +| Model | Dimensions | Parameters | VRAM | Use Case | +|-------|------------|------------|------|----------| +| `intfloat/e5-mistral-7b-instruct` | 4096 | 7B | ~14GB | High-quality embeddings | +| `BAAI/bge-large-en-v1.5` | 1024 | 335M | ~2GB | Balanced performance | +| `sentence-transformers/all-MiniLM-L6-v2` | 384 | 22M | ~500MB | Fast, lightweight | + +## Performance Optimization + +### Batching + +Enable batching for processing multiple texts efficiently: + +```python +embedder = VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + enable_batch=True, + batch_size=32 # Adjust based on GPU memory +) +``` + +### Async Processing + +Use async methods for concurrent operations: + +```python +import asyncio + +async def get_embeddings(): + embeddings = await embedder.async_get_embedding("Hello world") + return embeddings + +embeddings = asyncio.run(get_embeddings()) +``` + +## Usage with Knowledge Base + +Integrate with Agno's knowledge system: + +```python +from agno.knowledge.pdf import PDFKnowledgeBase +from agno.vectordb.pgvector import PgVector + +knowledge_base = PDFKnowledgeBase( + path="data/pdfs", + vector_db=PgVector( + table_name="vllm_documents", + db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + embedder=VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + ), + ), +) +``` + +## Params + + + +## Troubleshooting + +### Out of Memory Error +- Use a smaller model (e.g., `bge-small`, `MiniLM`) +- Reduce batch size +- Enable CPU offloading: `vllm_kwargs={"enforce_eager": False}` + +### Model Download Issues +- Models are downloaded from HuggingFace on first use +- Set `HF_HOME` environment variable to control cache location +- Pre-download: `huggingface-cli download intfloat/e5-mistral-7b-instruct` + +### Import Errors +- Ensure vLLM is installed: `pip install vllm` +- For GPU support, verify CUDA installation diff --git a/docs.json b/docs.json index 8664837b..91c1245e 100644 --- a/docs.json +++ b/docs.json @@ -333,6 +333,7 @@ "concepts/knowledge/embedder/qdrant_fastembed", "concepts/knowledge/embedder/sentencetransformers", "concepts/knowledge/embedder/together", + "concepts/knowledge/embedder/vllm", "concepts/knowledge/embedder/voyageai", "concepts/knowledge/embedder/aws_bedrock" ] @@ -1639,6 +1640,7 @@ "examples/concepts/knowledge/embedders/nebius-embedder", "examples/concepts/knowledge/embedders/sentence-transformer-embedder", "examples/concepts/knowledge/embedders/together-embedder", + "examples/concepts/knowledge/embedders/vllm-embedder", "examples/concepts/knowledge/embedders/voyageai-embedder" ] }, @@ -2996,6 +2998,7 @@ "reference/knowledge/embedder/openai", "reference/knowledge/embedder/sentence-transformer", "reference/knowledge/embedder/together", + "reference/knowledge/embedder/vllm", "reference/knowledge/embedder/voyageai" ] }, diff --git a/examples/concepts/knowledge/embedders/vllm-embedder.mdx b/examples/concepts/knowledge/embedders/vllm-embedder.mdx new file mode 100644 index 00000000..25708ba3 --- /dev/null +++ b/examples/concepts/knowledge/embedders/vllm-embedder.mdx @@ -0,0 +1,84 @@ +--- +title: vLLM Embedder +--- + +## Code + +```python cookbook/knowledge/embedders/vllm_embedder.py +from agno.agent import Agent +from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader +from 
agno.knowledge.embedder.vllm import VLLMEmbedder +from agno.models.openai import OpenAIChat +from agno.vectordb.pgvector import PgVector + +# Create knowledge base with vLLM embedder (local mode) +knowledge_base = PDFKnowledgeBase( + path="data/pdfs", + vector_db=PgVector( + table_name="vllm_documents", + db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + embedder=VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + ), + ), + reader=PDFReader(chunk=True), +) +knowledge_base.load(recreate=False) + +# Create agent with knowledge +agent = Agent( + model=OpenAIChat(id="gpt-4o"), + knowledge=knowledge_base, + search_knowledge=True, + show_tool_calls=True, + markdown=True, +) + +agent.print_response("What is the main topic?", markdown=True) +``` + +## Usage + + + + + + ```bash + pip install -U agno vllm openai sqlalchemy psycopg[binary] pgvector pypdf + ``` + + + + ```bash + export OPENAI_API_KEY=xxx + ``` + + + + ```bash + docker run -d \ + -e POSTGRES_DB=ai \ + -e POSTGRES_USER=ai \ + -e POSTGRES_PASSWORD=ai \ + -e PGDATA=/var/lib/postgresql/data/pgdata \ + -v pgvolume:/var/lib/postgresql/data \ + -p 5532:5432 \ + --name pgvector \ + agno/pgvector:16 + ``` + + + + ```bash + python cookbook/knowledge/embedders/vllm_embedder.py + ``` + + + +## Notes + +- This example uses **local mode** where vLLM loads the model directly (no server needed) +- For **remote mode**, use `base_url` parameter: `VLLMEmbedder(base_url="http://localhost:8000/v1")` +- GPU with ~14GB VRAM required for e5-mistral-7b-instruct model +- For CPU-only or lower memory, use smaller models like `BAAI/bge-small-en-v1.5` diff --git a/reference/knowledge/embedder/vllm.mdx b/reference/knowledge/embedder/vllm.mdx new file mode 100644 index 00000000..330c5b45 --- /dev/null +++ b/reference/knowledge/embedder/vllm.mdx @@ -0,0 +1,7 @@ +--- +title: vLLM +--- + +vLLM Embedder for local and remote embedding models with high-performance inference. + + From 8df4c1716c3f3be1733701f39152fff5da7e0eb9 Mon Sep 17 00:00:00 2001 From: Uzair Ali <72073401+uzaxirr@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:04:58 +0530 Subject: [PATCH 2/6] Update concepts/knowledge/embedder/vllm.mdx Co-authored-by: Dirk Brand <51947788+dirkbrnd@users.noreply.github.com> --- concepts/knowledge/embedder/vllm.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/concepts/knowledge/embedder/vllm.mdx b/concepts/knowledge/embedder/vllm.mdx index 2c7cd943..e7133270 100644 --- a/concepts/knowledge/embedder/vllm.mdx +++ b/concepts/knowledge/embedder/vllm.mdx @@ -20,7 +20,7 @@ vLLM Embedder supports two deployment modes: ### Local Mode -Direct model loading with the vLLM library. No server required. +You can directly load local models using the vLLM library, without any need to host a model on a server. 
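
Because local mode instantiates the vLLM engine in-process, engine options pass straight through `vllm_kwargs` to vLLM itself. A minimal sketch of trimming memory use on a smaller GPU, assuming the standard vLLM engine arguments `gpu_memory_utilization` and `max_model_len`, ahead of the basic example below:

```python
from agno.knowledge.embedder.vllm import VLLMEmbedder

# Lighter-weight local setup: a smaller model plus engine kwargs that
# cap vLLM's VRAM pool and context window (both forwarded to the engine).
embedder = VLLMEmbedder(
    id="BAAI/bge-large-en-v1.5",
    dimensions=1024,
    vllm_kwargs={
        "gpu_memory_utilization": 0.7,  # fraction of VRAM vLLM may claim
        "max_model_len": 512,           # shorter contexts need less memory
    },
)
embedding = embedder.get_embedding("Hello world")
```
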
```python from agno.knowledge.embedder.vllm import VLLMEmbedder From 4d9ea1d9dc687023c441c4c1023a1cccdec39ca0 Mon Sep 17 00:00:00 2001 From: Uzair Ali <72073401+uzaxirr@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:05:06 +0530 Subject: [PATCH 3/6] Update concepts/knowledge/embedder/vllm.mdx Co-authored-by: Dirk Brand <51947788+dirkbrnd@users.noreply.github.com> --- concepts/knowledge/embedder/vllm.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/concepts/knowledge/embedder/vllm.mdx b/concepts/knowledge/embedder/vllm.mdx index e7133270..bcdd8bb9 100644 --- a/concepts/knowledge/embedder/vllm.mdx +++ b/concepts/knowledge/embedder/vllm.mdx @@ -39,7 +39,7 @@ embedding = embedder.get_embedding("Hello world") - Single-machine deployment - GPU/CPU inference -**GPU Requirements:** +**Approximate GPU Requirements:** - e5-mistral-7b-instruct: ~14GB VRAM - BAAI/bge-large-en-v1.5: ~2GB VRAM - sentence-transformers/all-MiniLM-L6-v2: ~500MB VRAM From 519e7d7936fb9551d5118347d5ba946bafb398f2 Mon Sep 17 00:00:00 2001 From: Uzair Ali <72073401+uzaxirr@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:05:15 +0530 Subject: [PATCH 4/6] Update concepts/knowledge/embedder/vllm.mdx Co-authored-by: Dirk Brand <51947788+dirkbrnd@users.noreply.github.com> --- concepts/knowledge/embedder/vllm.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/concepts/knowledge/embedder/vllm.mdx b/concepts/knowledge/embedder/vllm.mdx index bcdd8bb9..48716592 100644 --- a/concepts/knowledge/embedder/vllm.mdx +++ b/concepts/knowledge/embedder/vllm.mdx @@ -46,7 +46,7 @@ embedding = embedder.get_embedding("Hello world") ### Remote Mode -Connects to a running vLLM server via OpenAI-compatible API. +You can connect to a running vLLM server via an OpenAI-compatible API. ```python embedder = VLLMEmbedder( From 31fd5b5888499a192dacf0f058b3672800cb1aa1 Mon Sep 17 00:00:00 2001 From: uzaxirr Date: Tue, 28 Oct 2025 17:17:04 +0530 Subject: [PATCH 5/6] cmnts --- concepts/knowledge/embedder/vllm.mdx | 164 ++++++------------ .../knowledge/embedders/vllm-embedder.mdx | 111 ++++++++---- reference/knowledge/embedder/vllm.mdx | 48 ++++- 3 files changed, 176 insertions(+), 147 deletions(-) diff --git a/concepts/knowledge/embedder/vllm.mdx b/concepts/knowledge/embedder/vllm.mdx index 48716592..4277f692 100644 --- a/concepts/knowledge/embedder/vllm.mdx +++ b/concepts/knowledge/embedder/vllm.mdx @@ -1,139 +1,89 @@ --- title: vLLM Embedder +sidebarTitle: vLLM --- -vLLM Embedder supports both local and remote embedding model deployment with high-performance inference optimized for throughput and latency. +The vLLM Embedder provides high-performance embedding inference with support for both local and remote deployment modes. All models are downloaded from HuggingFace. -## Prerequisites - -vLLM requires Python 3.8+ and GPU support for optimal performance. - -Install vLLM: - -```bash -pip install vllm -``` - -## Deployment Modes - -vLLM Embedder supports two deployment modes: +## Usage ### Local Mode You can directly load local models using the vLLM library, without any need to host a model on a server. 
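
Weights are fetched from HuggingFace the first time a model is used, so large models can take a while to load initially. A short sketch of pre-downloading them and controlling the cache location, assuming the standard HuggingFace tooling (`HF_HOME`, `huggingface-cli`):

```bash
# Optional: point the HuggingFace cache at a larger disk and
# pre-download the model so the first embedding call starts quickly.
export HF_HOME=/path/to/hf-cache
huggingface-cli download intfloat/e5-mistral-7b-instruct
```
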
-```python +```python vllm_embedder.py from agno.knowledge.embedder.vllm import VLLMEmbedder +from agno.knowledge.knowledge import Knowledge +from agno.vectordb.pgvector import PgVector -embedder = VLLMEmbedder( +# Get embeddings directly +embeddings = VLLMEmbedder( id="intfloat/e5-mistral-7b-instruct", - dimensions=4096 + dimensions=4096, + enforce_eager=True, + vllm_kwargs={ + "disable_sliding_window": True, + "max_model_len": 4096, + }, +).get_embedding("The quick brown fox jumps over the lazy dog.") + +print(f"Embeddings: {embeddings[:5]}") +print(f"Dimensions: {len(embeddings)}") + +# Use with Knowledge +knowledge = Knowledge( + vector_db=PgVector( + db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + table_name="vllm_embeddings", + embedder=VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + enforce_eager=True, + vllm_kwargs={ + "disable_sliding_window": True, + "max_model_len": 4096, + }, + ), + ), + max_results=2, ) - -# Get embeddings -embedding = embedder.get_embedding("Hello world") ``` -**Use Cases:** -- Development and testing -- Single-machine deployment -- GPU/CPU inference - -**Approximate GPU Requirements:** -- e5-mistral-7b-instruct: ~14GB VRAM -- BAAI/bge-large-en-v1.5: ~2GB VRAM -- sentence-transformers/all-MiniLM-L6-v2: ~500MB VRAM - ### Remote Mode You can connect to a running vLLM server via an OpenAI-compatible API. -```python -embedder = VLLMEmbedder( - base_url="http://localhost:8000/v1", - api_key="your-key" # Optional -) -``` - -**Use Cases:** -- Production deployments -- Shared infrastructure -- Horizontal scaling -- Load balancing - -## Recommended Models - -| Model | Dimensions | Parameters | VRAM | Use Case | -|-------|------------|------------|------|----------| -| `intfloat/e5-mistral-7b-instruct` | 4096 | 7B | ~14GB | High-quality embeddings | -| `BAAI/bge-large-en-v1.5` | 1024 | 335M | ~2GB | Balanced performance | -| `sentence-transformers/all-MiniLM-L6-v2` | 384 | 22M | ~500MB | Fast, lightweight | - -## Performance Optimization - -### Batching - -Enable batching for processing multiple texts efficiently: - -```python -embedder = VLLMEmbedder( - id="intfloat/e5-mistral-7b-instruct", - enable_batch=True, - batch_size=32 # Adjust based on GPU memory -) -``` - -### Async Processing - -Use async methods for concurrent operations: - -```python -import asyncio - -async def get_embeddings(): - embeddings = await embedder.async_get_embedding("Hello world") - return embeddings - -embeddings = asyncio.run(get_embeddings()) -``` - -## Usage with Knowledge Base - -Integrate with Agno's knowledge system: - -```python -from agno.knowledge.pdf import PDFKnowledgeBase -from agno.vectordb.pgvector import PgVector - -knowledge_base = PDFKnowledgeBase( - path="data/pdfs", +```python vllm_embedder_remote.py +# Remote mode (for production deployments) +knowledge_remote = Knowledge( vector_db=PgVector( - table_name="vllm_documents", db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + table_name="vllm_embeddings_remote", embedder=VLLMEmbedder( id="intfloat/e5-mistral-7b-instruct", dimensions=4096, + base_url="http://localhost:8000/v1", # Example endpoint for local development + api_key="your-api-key", # Optional ), ), + max_results=2, ) ``` ## Params - - -## Troubleshooting - -### Out of Memory Error -- Use a smaller model (e.g., `bge-small`, `MiniLM`) -- Reduce batch size -- Enable CPU offloading: `vllm_kwargs={"enforce_eager": False}` - -### Model Download Issues -- Models are downloaded from HuggingFace on first use -- Set `HF_HOME` 
environment variable to control cache location -- Pre-download: `huggingface-cli download intfloat/e5-mistral-7b-instruct` - -### Import Errors -- Ensure vLLM is installed: `pip install vllm` -- For GPU support, verify CUDA installation +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `id` | `str` | `"intfloat/e5-mistral-7b-instruct"` | Model identifier (HuggingFace model name) | +| `dimensions` | `int` | `4096` | Embedding vector dimensions | +| `base_url` | `Optional[str]` | `None` | Remote vLLM server URL (enables remote mode) | +| `api_key` | `Optional[str]` | `getenv("VLLM_API_KEY")` | API key for remote server authentication | +| `enable_batch` | `bool` | `False` | Enable batch processing for multiple texts | +| `batch_size` | `int` | `10` | Number of texts to process per batch | +| `enforce_eager` | `bool` | `True` | Use eager execution mode (local mode) | +| `vllm_kwargs` | `Optional[Dict[str, Any]]` | `None` | Additional vLLM engine parameters (local mode) | +| `request_params` | `Optional[Dict[str, Any]]` | `None` | Additional request parameters (remote mode) | +| `client_params` | `Optional[Dict[str, Any]]` | `None` | OpenAI client configuration (remote mode) | + +## Developer Resources +- View [Cookbook](https://github.com/agno-agi/agno/tree/main/cookbook/knowledge/embedders/vllm_embedder.py) diff --git a/examples/concepts/knowledge/embedders/vllm-embedder.mdx b/examples/concepts/knowledge/embedders/vllm-embedder.mdx index 25708ba3..cc7cac8c 100644 --- a/examples/concepts/knowledge/embedders/vllm-embedder.mdx +++ b/examples/concepts/knowledge/embedders/vllm-embedder.mdx @@ -4,38 +4,72 @@ title: vLLM Embedder ## Code -```python cookbook/knowledge/embedders/vllm_embedder.py -from agno.agent import Agent -from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader +```python vllm_embedder.py +import asyncio + from agno.knowledge.embedder.vllm import VLLMEmbedder -from agno.models.openai import OpenAIChat +from agno.knowledge.knowledge import Knowledge from agno.vectordb.pgvector import PgVector -# Create knowledge base with vLLM embedder (local mode) -knowledge_base = PDFKnowledgeBase( - path="data/pdfs", - vector_db=PgVector( - table_name="vllm_documents", - db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", - embedder=VLLMEmbedder( - id="intfloat/e5-mistral-7b-instruct", - dimensions=4096, + +def main(): + # Basic usage - get embeddings directly + embeddings = VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + enforce_eager=True, + vllm_kwargs={ + "disable_sliding_window": True, + "max_model_len": 4096, + }, + ).get_embedding("The quick brown fox jumps over the lazy dog.") + + # Print the embeddings and their dimensions + print(f"Embeddings: {embeddings[:5]}") + print(f"Dimensions: {len(embeddings)}") + + # Local Mode with Knowledge + knowledge = Knowledge( + vector_db=PgVector( + db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + table_name="vllm_embeddings", + embedder=VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + enforce_eager=True, + vllm_kwargs={ + "disable_sliding_window": True, + "max_model_len": 4096, + }, + ), ), - ), - reader=PDFReader(chunk=True), -) -knowledge_base.load(recreate=False) - -# Create agent with knowledge -agent = Agent( - model=OpenAIChat(id="gpt-4o"), - knowledge=knowledge_base, - search_knowledge=True, - show_tool_calls=True, - markdown=True, -) - -agent.print_response("What is the main topic?", markdown=True) + max_results=2, + ) + + # Remote 
mode with Knowledge + knowledge_remote = Knowledge( + vector_db=PgVector( + db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + table_name="vllm_embeddings_remote", + embedder=VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + base_url="http://localhost:8000/v1", + api_key="your-api-key", # Optional + ), + ), + max_results=2, + ) + + asyncio.run( + knowledge.add_content_async( + path="cookbook/knowledge/testing_resources/cv_1.pdf", + ) + ) + + +if __name__ == "__main__": + main() ``` ## Usage @@ -49,12 +83,6 @@ agent.print_response("What is the main topic?", markdown=True) ``` - - ```bash - export OPENAI_API_KEY=xxx - ``` - - ```bash docker run -d \ @@ -69,16 +97,23 @@ agent.print_response("What is the main topic?", markdown=True) ``` - - ```bash - python cookbook/knowledge/embedders/vllm_embedder.py + + + ```bash Mac + python vllm_embedder.py + ``` + + ```bash Windows + python vllm_embedder.py ``` + ## Notes - This example uses **local mode** where vLLM loads the model directly (no server needed) -- For **remote mode**, use `base_url` parameter: `VLLMEmbedder(base_url="http://localhost:8000/v1")` +- For **remote mode**, the code includes `knowledge_remote` example with `base_url` parameter - GPU with ~14GB VRAM required for e5-mistral-7b-instruct model - For CPU-only or lower memory, use smaller models like `BAAI/bge-small-en-v1.5` +- Models are automatically downloaded from HuggingFace on first use diff --git a/reference/knowledge/embedder/vllm.mdx b/reference/knowledge/embedder/vllm.mdx index 330c5b45..5257def9 100644 --- a/reference/knowledge/embedder/vllm.mdx +++ b/reference/knowledge/embedder/vllm.mdx @@ -2,6 +2,50 @@ title: vLLM --- -vLLM Embedder for local and remote embedding models with high-performance inference. +The vLLM Embedder provides high-performance embedding inference with support for both local and remote deployment modes. It can load models directly for local inference or connect to a remote vLLM server via an OpenAI-compatible API. 
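
For the remote case, a minimal sketch of pointing the embedder at a server endpoint; the URL and key below are placeholders, and `api_key` falls back to the `VLLM_API_KEY` environment variable when omitted:

```python
from agno.knowledge.embedder.vllm import VLLMEmbedder

# Remote mode: embeddings come from a running vLLM server rather than
# an in-process engine, so no local GPU or model download is needed.
embedder = VLLMEmbedder(
    id="intfloat/e5-mistral-7b-instruct",
    dimensions=4096,
    base_url="http://localhost:8000/v1",  # example endpoint
    api_key="your-api-key",  # optional
)
embedding = embedder.get_embedding("Hello world")
```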
- +## Usage + +```python +from agno.knowledge.embedder.vllm import VLLMEmbedder +from agno.knowledge.knowledge import Knowledge +from agno.vectordb.pgvector import PgVector + +# Local mode +embedder = VLLMEmbedder( + id="intfloat/e5-mistral-7b-instruct", + dimensions=4096, + enforce_eager=True, + vllm_kwargs={ + "disable_sliding_window": True, + "max_model_len": 4096, + }, +) + +# Use with Knowledge +knowledge = Knowledge( + vector_db=PgVector( + db_url="postgresql+psycopg://ai:ai@localhost:5532/ai", + table_name="vllm_embeddings", + embedder=embedder, + ), +) +``` + +## Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `id` | `str` | `"intfloat/e5-mistral-7b-instruct"` | Model identifier (HuggingFace model name) | +| `dimensions` | `int` | `4096` | Embedding vector dimensions | +| `base_url` | `Optional[str]` | `None` | Remote vLLM server URL (enables remote mode) | +| `api_key` | `Optional[str]` | `getenv("VLLM_API_KEY")` | API key for remote server authentication | +| `enable_batch` | `bool` | `False` | Enable batch processing for multiple texts | +| `batch_size` | `int` | `10` | Number of texts to process per batch | +| `enforce_eager` | `bool` | `True` | Use eager execution mode (local mode) | +| `vllm_kwargs` | `Optional[Dict[str, Any]]` | `None` | Additional vLLM engine parameters (local mode) | +| `request_params` | `Optional[Dict[str, Any]]` | `None` | Additional request parameters (remote mode) | +| `client_params` | `Optional[Dict[str, Any]]` | `None` | OpenAI client configuration (remote mode) | + +## Developer Resources +- View [Cookbook](https://github.com/agno-agi/agno/tree/main/cookbook/knowledge/embedders/vllm_embedder.py) From 1602d4e07e6bab0878fd1530ec2ca1f6507f9c9d Mon Sep 17 00:00:00 2001 From: Dirk Brand <51947788+dirkbrnd@users.noreply.github.com> Date: Tue, 28 Oct 2025 16:02:00 +0200 Subject: [PATCH 6/6] Apply suggestion from @dirkbrnd --- concepts/knowledge/embedder/vllm.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/concepts/knowledge/embedder/vllm.mdx b/concepts/knowledge/embedder/vllm.mdx index 4277f692..6a61b92f 100644 --- a/concepts/knowledge/embedder/vllm.mdx +++ b/concepts/knowledge/embedder/vllm.mdx @@ -9,7 +9,7 @@ The vLLM Embedder provides high-performance embedding inference with support for ### Local Mode -You can directly load local models using the vLLM library, without any need to host a model on a server. +You can load local models directly using the vLLM library, without any need to host a model on a server. ```python vllm_embedder.py from agno.knowledge.embedder.vllm import VLLMEmbedder
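
# A minimal sketch of batch plus async usage, assuming `enable_batch` and
# `batch_size` behave as the parameter table above describes and that the
# `async_get_embedding` coroutine works as in the earlier async example:
import asyncio

from agno.knowledge.embedder.vllm import VLLMEmbedder

# Batch mode embeds several texts per engine call instead of one at a time.
embedder = VLLMEmbedder(
    id="BAAI/bge-large-en-v1.5",
    dimensions=1024,
    enable_batch=True,
    batch_size=32,  # tune to available GPU memory
)

async def embed_async() -> None:
    # Async variant of get_embedding, useful in concurrent pipelines.
    embedding = await embedder.async_get_embedding(
        "The quick brown fox jumps over the lazy dog."
    )
    print(len(embedding))  # 1024 for bge-large-en-v1.5

asyncio.run(embed_async())
```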