|
| 1 | +"""Base embeddings callable utilities for RAG systems.""" |
| 2 | + |
| 3 | +from typing import Protocol, TypeVar, runtime_checkable |
| 4 | + |
| 5 | +import numpy as np |
| 6 | + |
| 7 | +from crewai.rag.core.types import ( |
| 8 | + Embeddable, |
| 9 | + Embedding, |
| 10 | + Embeddings, |
| 11 | + PyEmbedding, |
| 12 | +) |
| 13 | + |
# Generic item type used by helpers that accept either one item or a list of items.
T = TypeVar("T")
# Input type of an embedding function: bounded by Embeddable and contravariant,
# which is what parameterizes the EmbeddingFunction protocol below.
D = TypeVar("D", bound=Embeddable, contravariant=True)
| 16 | + |
| 17 | + |
def normalize_embeddings(
    target: Embedding | list[Embedding] | PyEmbedding | list[PyEmbedding] | None,
) -> Embeddings | None:
    """Normalize various embedding formats to a standard list of numpy arrays.

    Args:
        target: Input embeddings in one of the supported formats: a flat
            sequence of numbers (a single embedding), a sequence of
            lists/tuples of numbers, a 1-D or 2-D numpy array, or a sequence
            of numpy arrays. ``None`` is passed through unchanged.

    Returns:
        A list of ``float32`` numpy arrays (one per embedding), or ``None``
        if the input is ``None``.

    Raises:
        ValueError: If the input is an empty sequence, a numpy array with
            more than two dimensions (or zero dimensions), or a sequence
            whose first element is of an unsupported type.
    """
    # Honor the documented contract: None in, None out (instead of failing
    # with a TypeError on ``target[0]``).
    if target is None:
        return None

    if isinstance(target, np.ndarray):
        if target.ndim == 1:
            # A single embedding vector.
            return [target.astype(np.float32)]
        if target.ndim == 2:
            # One embedding per row.
            return [row.astype(np.float32) for row in target]
        raise ValueError(f"Unsupported numpy array shape: {target.shape}")

    # An empty sequence must surface as the documented ValueError rather
    # than as an IndexError from the ``target[0]`` probe below.
    if len(target) == 0:
        raise ValueError("Expected at least one embedding, got an empty sequence")

    # The format of the whole input is inferred from its first element.
    first = target[0]
    # numpy scalars are accepted alongside Python numbers: np.float32 and the
    # numpy integer types are not subclasses of int/float. bool is excluded
    # explicitly because it is a subclass of int.
    if isinstance(first, (int, float, np.integer, np.floating)) and not isinstance(
        first, bool
    ):
        return [np.array(target, dtype=np.float32)]
    if isinstance(first, (list, tuple)):
        return [np.array(emb, dtype=np.float32) for emb in target]
    if isinstance(first, np.ndarray):
        return [emb.astype(np.float32) for emb in target]  # type: ignore[union-attr]

    raise ValueError(f"Unsupported embeddings format: {type(first)}")
| 49 | + |
| 50 | + |
def maybe_cast_one_to_many(target: T | list[T] | None) -> list[T] | None:
    """Ensure the value is a list, wrapping a lone item when necessary.

    Args:
        target: Either one item, a list of items, or None.

    Returns:
        The same list when a list was given, a one-element list wrapping a
        single item, or None when the input is None.
    """
    # Lists pass through untouched (the same object is returned, not a copy).
    if isinstance(target, list):
        return target
    if target is None:
        return None
    return [target]
| 63 | + |
| 64 | + |
def _is_numeric_scalar(value: object) -> bool:
    """Return True for numpy integer/floating scalars and floats, but never for bool."""
    if isinstance(value, bool):
        return False
    return isinstance(value, (np.integer, float, np.floating))


def validate_embeddings(embeddings: Embeddings) -> Embeddings:
    """Check that embeddings is a non-empty list of non-empty numeric arrays.

    Args:
        embeddings: List of numpy arrays to check.

    Returns:
        The very same list that was passed in, once every check has passed.

    Raises:
        ValueError: If the container is not a list, is empty, holds anything
            other than numpy arrays, or holds an array that is 0-dimensional,
            has no values, or yields non-numeric items when iterated.
    """
    if not isinstance(embeddings, list):
        raise ValueError(
            f"Expected embeddings to be a list, got {type(embeddings).__name__}"
        )
    if not embeddings:
        raise ValueError(
            f"Expected embeddings to be a list with at least one item, got {len(embeddings)} embeddings"
        )
    # Every element's type is checked up front, before any per-array inspection.
    if any(not isinstance(item, np.ndarray) for item in embeddings):
        raise ValueError(
            "Expected each embedding in the embeddings to be a numpy array"
        )

    i = 0
    for embedding in embeddings:
        if embedding.ndim == 0:
            raise ValueError(
                f"Expected a 1-dimensional array, got a 0-dimensional array {embedding}"
            )
        if embedding.size == 0:
            raise ValueError(
                f"Expected each embedding to be a 1-dimensional numpy array with at least 1 value. "
                f"Got an array with no values at position {i}"
            )
        # Iterating the array yields its items one by one; each must be numeric.
        if any(not _is_numeric_scalar(value) for value in embedding):
            raise ValueError(
                f"Expected embedding to contain numeric values, got non-numeric values at position {i}"
            )
        i += 1
    return embeddings
| 108 | + |
| 109 | + |
@runtime_checkable
class EmbeddingFunction(Protocol[D]):
    """Protocol for embedding functions.

    Embedding functions convert input data (documents or images) into vector embeddings.
    Every subclass has its ``__call__`` wrapped at class-creation time so that
    whatever it returns is normalized and validated before reaching the caller.
    """

    def __call__(self, input: D) -> Embeddings:
        """Convert input data to embeddings.

        Args:
            input: Input data to embed (documents or images).

        Returns:
            List of numpy arrays representing the embeddings.
        """
        ...

    def __init_subclass__(cls) -> None:
        """Wrap the subclass's ``__call__`` to normalize and validate embeddings.

        The wrapper raises ValueError when the underlying call returns None,
        then passes the result through ``normalize_embeddings`` and
        ``validate_embeddings``. A ``__call__`` that is inherited and already
        wrapped is left alone so results are not processed twice.
        """
        # Imported locally so this class does not depend on a new
        # module-level import.
        from functools import wraps

        super().__init_subclass__()
        original_call = cls.__call__

        # A subclass that does not override __call__ inherits the parent's
        # wrapper; wrapping it again would only repeat the same work per call.
        if getattr(original_call, "_normalizes_embeddings", False):
            return

        # wraps() keeps the original method's name, docstring and signature
        # visible to introspection tools.
        @wraps(original_call)
        def wrapped_call(self: EmbeddingFunction[D], input: D) -> Embeddings:
            result = original_call(self, input)
            if result is None:
                raise ValueError("Embedding function returned None")
            normalized = normalize_embeddings(result)
            if normalized is None:
                raise ValueError("Normalization returned None for non-None input")
            return validate_embeddings(normalized)

        # Marker consulted by the guard above when further subclasses are created.
        wrapped_call._normalizes_embeddings = True  # type: ignore[attr-defined]
        cls.__call__ = wrapped_call  # type: ignore[method-assign]
0 commit comments