microcore.embedding_db
import logging
import sys
from abc import ABC, abstractmethod
from dataclasses import dataclass

import tiktoken

from ..utils import ExtendedString


class SearchResults(list):
    """List of search results that can be trimmed to a token budget."""

    def fit_to_token_size(
        self,
        max_tokens: int,
        for_model: str = None,
        encoding: str | tiktoken.Encoding = None,
        verbose=True,
    ):
        """
        Return a new `SearchResults` reduced to fit `max_tokens`.

        The actual selection is delegated to `..tokenizing.fit_to_token_size`.

        Args:
            max_tokens (int): token budget passed to the helper
            for_model (str): passed through to the helper unchanged
            encoding (str | tiktoken.Encoding): passed through to the helper unchanged
            verbose (bool): log a message when fewer records are returned

        Returns:
            SearchResults: new list built from the records the helper returned
        """
        # Function-scope import, as in the original listing.
        from ..tokenizing import fit_to_token_size

        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
        if verbose and len(records) < len(self):
            logging.info(
                "For fitting %d records to %d tokens, %d records was removed",
                len(self),
                max_tokens,
                removed,
            )
        # The list constructor accepts any iterable, so no intermediate copy is needed.
        return SearchResults(records)


class SearchResult(ExtendedString):
    """
    String containing the search result with additional information in attributes

    Attributes:
        id (str): document (text) identifier in embedding database
        distance (float): The distance between the query and the search result
        metadata (dict): A dictionary containing document metadata
    """

    id: str
    distance: float
    metadata: dict


# No fields are declared on this class; the decorator is kept from the original.
@dataclass
class AbstractEmbeddingDB(ABC):
    """
    Base class for embedding databases
    """

    @abstractmethod
    def search(
        self,
        collection: str,
        query: str | list,
        n_results: int = 5,
        where: dict = None,
        **kwargs,
    ) -> list[str | SearchResult]:
        """
        Similarity search

        Args:
            collection (str): collection name
            query (str | list): query string or list of query strings
            n_results (int): number of results to return
            where (dict): filter results by metadata
            **kwargs: additional arguments
        """

    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
        """
        Alias for `search`
        """
        return self.search(*args, **kwargs)

    def find_all(
        self,
        collection: str,
        query: str | list,
        where: dict = None,
        **kwargs,
    ) -> SearchResults | list[str | SearchResult]:
        """
        Run `search` with `n_results` set to `sys.maxsize - 1`.

        Args:
            collection (str): collection name
            query (str | list): query string or list of query strings
            where (dict): filter results by metadata
            **kwargs: additional arguments forwarded to `search`
        """
        return self.search(
            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
        )

    @abstractmethod
    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
        """Return all documents in the collection"""

    def save(self, collection: str, text: str, metadata: dict = None):
        """Save a single document in the collection"""
        # Implemented on top of `save_many` with a one-item list.
        self.save_many(collection, [(text, metadata)])

    @abstractmethod
    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
        """Save multiple documents in the collection"""

    @abstractmethod
    def clear(self, collection: str):
        """Clear the collection"""

    def find_one(self, collection: str, query: str | list) -> str | SearchResult | None:
        """
        Find most similar document in the collection

        Returns:
            Most similar document or None if the search returns nothing
        """
        results = self.search(collection, query, 1)
        return next(iter(results), None)

    @abstractmethod
    def count(self, collection: str) -> int:
        """
        Count the number of documents in the collection

        Returns:
            Number of documents in the collection
        """

    @abstractmethod
    def delete(self, collection: str, what: str | list[str] | dict):
        """
        Delete documents from the collection

        Args:
            collection (str): collection name
            what (str | list[str] | dict): id, list of ids or metadata query
        """
class
SearchResults(builtins.list):
class SearchResults(list):
    """List of search results that can be trimmed to a token budget."""

    def fit_to_token_size(
        self,
        max_tokens: int,
        for_model: str = None,
        encoding: str | tiktoken.Encoding = None,
        verbose=True,
    ):
        """
        Return a new `SearchResults` reduced to fit `max_tokens`.

        The actual selection is delegated to `..tokenizing.fit_to_token_size`.

        Args:
            max_tokens (int): token budget passed to the helper
            for_model (str): passed through to the helper unchanged
            encoding (str | tiktoken.Encoding): passed through to the helper unchanged
            verbose (bool): log a message when fewer records are returned

        Returns:
            SearchResults: new list built from the records the helper returned
        """
        # Function-scope import, as in the original listing.
        from ..tokenizing import fit_to_token_size

        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
        if verbose and len(records) < len(self):
            logging.info(
                "For fitting %d records to %d tokens, %d records was removed",
                len(self),
                max_tokens,
                removed,
            )
        # The list constructor accepts any iterable, so no intermediate copy is needed.
        return SearchResults(records)
Built-in mutable sequence.
If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.
def
fit_to_token_size( self, max_tokens: int, for_model: str = None, encoding: str | tiktoken.core.Encoding = None, verbose=True):
def fit_to_token_size(
    self,
    max_tokens: int,
    for_model: str = None,
    encoding: str | tiktoken.Encoding = None,
    verbose=True,
):
    """
    Return a new `SearchResults` reduced to fit `max_tokens`.

    The actual selection is delegated to `..tokenizing.fit_to_token_size`.

    Args:
        max_tokens (int): token budget passed to the helper
        for_model (str): passed through to the helper unchanged
        encoding (str | tiktoken.Encoding): passed through to the helper unchanged
        verbose (bool): log a message when fewer records are returned

    Returns:
        SearchResults: new list built from the records the helper returned
    """
    # Function-scope import, as in the original listing.
    from ..tokenizing import fit_to_token_size

    records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
    if verbose and len(records) < len(self):
        logging.info(
            "For fitting %d records to %d tokens, %d records was removed",
            len(self),
            max_tokens,
            removed,
        )
    # The list constructor accepts any iterable, so no intermediate copy is needed.
    return SearchResults(records)
Inherited Members
- builtins.list
- list
- clear
- copy
- append
- insert
- extend
- pop
- remove
- index
- count
- reverse
- sort
class SearchResult(ExtendedString):
    """
    String containing the search result with additional information in attributes

    Attributes:
        id (str): document (text) identifier in embedding database
        distance (float): The distance between the query and the search result
        metadata (dict): A dictionary containing document metadata
    """

    # Annotations only: no values are assigned in this class body.
    id: str
    distance: float
    metadata: dict
String containing the search result with additional information in attributes
Attributes:
- id (str): document (text) identifier in embedding database
- distance (float): The distance between the query and the search result
- metadata (dict): A dictionary containing document metadata
SearchResult(string: str, attrs: dict = None)
def __new__(cls, string: str, attrs: dict = None):
    """
    Allows string to have attributes.

    Args:
        string (str): text content of the new instance
        attrs (dict): optional mapping of attribute names to values; each
            pair is set on the new instance with `setattr`

    Returns:
        New instance of `cls` whose string value is `string`
    """
    obj = str.__new__(cls, string)
    # None and an empty dict both mean there is nothing to attach.
    for name, value in (attrs or {}).items():
        setattr(obj, name, value)
    return obj
Allows string to have attributes.
Inherited Members
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- count
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format
- format_map
- maketrans
# No fields are declared on this class; the decorator is kept from the original.
@dataclass
class AbstractEmbeddingDB(ABC):
    """
    Base class for embedding databases
    """

    @abstractmethod
    def search(
        self,
        collection: str,
        query: str | list,
        n_results: int = 5,
        where: dict = None,
        **kwargs,
    ) -> list[str | SearchResult]:
        """
        Similarity search

        Args:
            collection (str): collection name
            query (str | list): query string or list of query strings
            n_results (int): number of results to return
            where (dict): filter results by metadata
            **kwargs: additional arguments
        """

    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
        """
        Alias for `search`
        """
        return self.search(*args, **kwargs)

    def find_all(
        self,
        collection: str,
        query: str | list,
        where: dict = None,
        **kwargs,
    ) -> SearchResults | list[str | SearchResult]:
        """
        Run `search` with `n_results` set to `sys.maxsize - 1`.

        Args:
            collection (str): collection name
            query (str | list): query string or list of query strings
            where (dict): filter results by metadata
            **kwargs: additional arguments forwarded to `search`
        """
        return self.search(
            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
        )

    @abstractmethod
    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
        """Return all documents in the collection"""

    def save(self, collection: str, text: str, metadata: dict = None):
        """Save a single document in the collection"""
        # Implemented on top of `save_many` with a one-item list.
        self.save_many(collection, [(text, metadata)])

    @abstractmethod
    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
        """Save multiple documents in the collection"""

    @abstractmethod
    def clear(self, collection: str):
        """Clear the collection"""

    def find_one(self, collection: str, query: str | list) -> str | SearchResult | None:
        """
        Find most similar document in the collection

        Returns:
            Most similar document or None if the search returns nothing
        """
        results = self.search(collection, query, 1)
        return next(iter(results), None)

    @abstractmethod
    def count(self, collection: str) -> int:
        """
        Count the number of documents in the collection

        Returns:
            Number of documents in the collection
        """

    @abstractmethod
    def delete(self, collection: str, what: str | list[str] | dict):
        """
        Delete documents from the collection

        Args:
            collection (str): collection name
            what (str | list[str] | dict): id, list of ids or metadata query
        """
Base class for embedding databases
@abstractmethod
def
search( self, collection: str, query: str | list, n_results: int = 5, where: dict = None, **kwargs) -> list[str | SearchResult]:
@abstractmethod
def search(
    self,
    collection: str,
    query: str | list,
    n_results: int = 5,
    where: dict = None,
    **kwargs,
) -> list[str | SearchResult]:
    """
    Similarity search

    Abstract: the body is only this docstring, so subclasses supply the
    implementation.

    Args:
        collection (str): collection name
        query (str | list): query string or list of query strings
        n_results (int): number of results to return
        where (dict): filter results by metadata
        **kwargs: additional arguments
    """
Similarity search
Arguments:
- collection (str): collection name
- query (str | list): query string or list of query strings
- n_results (int): number of results to return
- where (dict): filter results by metadata
- **kwargs: additional arguments
def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
    """
    Alias for `search`

    All positional and keyword arguments are forwarded unchanged, and the
    result of `search` is returned as is.
    """
    return self.search(*args, **kwargs)
Alias for `search`.
def
find_all( self, collection: str, query: str | list, where: dict = None, **kwargs) -> SearchResults | list[str | SearchResult]:
@abstractmethod
def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
    """
    Return all documents in the collection

    Abstract: the body is only this docstring, so subclasses supply the
    implementation.

    Args:
        collection (str): collection name
    """
Return all documents in the collection
def
save(self, collection: str, text: str, metadata: dict = None):
def save(self, collection: str, text: str, metadata: dict = None):
    """
    Save a single document in the collection

    Args:
        collection (str): collection name
        text (str): document text
        metadata (dict): document metadata; passed on as None when omitted
    """
    # Implemented on top of `save_many` with a one-item list.
    self.save_many(collection, [(text, metadata)])
Save a single document in the collection
@abstractmethod
def
save_many(self, collection: str, items: list[tuple[str, dict] | str]):
98 @abstractmethod 99 def save_many(self, collection: str, items: list[tuple[str, dict] | str]): 100 """Save multiple documents in the collection"""
Save multiple documents in the collection
def find_one(self, collection: str, query: str | list) -> str | SearchResult | None:
    """
    Find most similar document in the collection

    Calls `search` with a result limit of 1 and takes the first item.

    Args:
        collection (str): collection name
        query (str | list): query string or list of query strings

    Returns:
        Most similar document or None if the search returns nothing
    """
    results = self.search(collection, query, 1)
    # The None default covers an empty result without raising StopIteration.
    return next(iter(results), None)
Find most similar document in the collection
Returns:
Most similar document or None if collection is empty
@abstractmethod
def
count(self, collection: str) -> int:
@abstractmethod
def count(self, collection: str) -> int:
    """
    Count the number of documents in the collection

    Abstract: the body is only this docstring, so subclasses supply the
    implementation.

    Args:
        collection (str): collection name

    Returns:
        Number of documents in the collection
    """
Count the number of documents in the collection
Returns:
Number of documents in the collection
@abstractmethod
def
delete(self, collection: str, what: str | list[str] | dict):
124 @abstractmethod 125 def delete(self, collection: str, what: str | list[str] | dict): 126 """ 127 Delete documents from the collection 128 129 Args: 130 collection (str): collection name 131 what (str | list[str] | dict): id, list ids or metadata query 132 """
Delete documents from the collection
Arguments:
- collection (str): collection name
- what (str | list[str] | dict): a single id, a list of ids, or a metadata query