microcore.embedding_db

  1import logging
  2import sys
  3from abc import ABC, abstractmethod
  4from dataclasses import dataclass
  5
  6import tiktoken
  7
  8from ..utils import ExtendedString
  9
 10
 11class SearchResults(list):
 12    def fit_to_token_size(
 13            self,
 14            max_tokens: int,
 15            for_model: str = None,
 16            encoding: str | tiktoken.Encoding = None,
 17            verbose=True
 18    ):
 19        from ..tokenizing import fit_to_token_size
 20        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
 21        if verbose and len(records) < len(self):
 22            logging.info(
 23                "For fitting %d records to %d tokens, %d records was removed",
 24                len(self),
 25                max_tokens,
 26                removed
 27            )
 28        return SearchResults(list(records))
 29
 30
 31class SearchResult(ExtendedString):
 32    """
 33    String containing the search result with additional information in attributes
 34
 35    Attributes:
 36        id (str): document (text) identifier in embedding database
 37        distance (float): The distance between the query and the search result
 38        metadata (dict): A dictionary containing document metadata
 39    """
 40
 41    id: str
 42    distance: float
 43    metadata: dict
 44
 45
 46@dataclass
 47class AbstractEmbeddingDB(ABC):
 48    """
 49    Base class for embedding databases
 50    """
 51
 52    @abstractmethod
 53    def search(
 54            self,
 55            collection: str,
 56            query: str | list,
 57            n_results: int = 5,
 58            where: dict = None,
 59            **kwargs,
 60    ) -> list[str | SearchResult]:
 61        """
 62        Similarity search
 63
 64        Args:
 65            collection (str): collection name
 66            query (str | list): query string or list of query strings
 67            n_results (int): number of results to return
 68            where (dict): filter results by metadata
 69            **kwargs: additional arguments
 70        """
 71
 72    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
 73        """
 74        Alias for `search`
 75        """
 76        return self.search(*args, **kwargs)
 77
 78    def find_all(
 79            self,
 80            collection: str,
 81            query: str | list,
 82            where: dict = None,
 83            **kwargs,
 84    ) -> SearchResults | list[str | SearchResult]:
 85        return self.search(
 86            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
 87        )
 88
 89    @abstractmethod
 90    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
 91        """Return all documents in the collection"""
 92
 93    def save(self, collection: str, text: str, metadata: dict = None):
 94        """Save a single document in the collection"""
 95        self.save_many(collection, [(text, metadata)])
 96
 97    @abstractmethod
 98    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
 99        """Save multiple documents in the collection"""
100
101    @abstractmethod
102    def clear(self, collection: str):
103        """Clear the collection"""
104
105    def find_one(self, collection: str, query: str | list) -> str | SearchResult | None:
106        """
107        Find most similar document in the collection
108
109        Returns:
110            Most similar document or None if collection is empty
111        """
112        return next(iter(self.search(collection, query, 1)), None)
113
114    @abstractmethod
115    def count(self, collection: str) -> int:
116        """
117        Count the number of documents in the collection
118
119        Returns:
120            Number of documents in the collection
121        """
122
123    @abstractmethod
124    def delete(self, collection: str, what: str | list[str] | dict):
125        """
126        Delete documents from the collection
127
128        Args:
129            collection (str): collection name
130            what (str | list[str] | dict): id, list ids or metadata query
131        """
class SearchResults(builtins.list):
12class SearchResults(list):
13    def fit_to_token_size(
14            self,
15            max_tokens: int,
16            for_model: str = None,
17            encoding: str | tiktoken.Encoding = None,
18            verbose=True
19    ):
20        from ..tokenizing import fit_to_token_size
21        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
22        if verbose and len(records) < len(self):
23            logging.info(
24                "For fitting %d records to %d tokens, %d records was removed",
25                len(self),
26                max_tokens,
27                removed
28            )
29        return SearchResults(list(records))

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

def fit_to_token_size( self, max_tokens: int, for_model: str = None, encoding: str | tiktoken.core.Encoding = None, verbose=True):
13    def fit_to_token_size(
14            self,
15            max_tokens: int,
16            for_model: str = None,
17            encoding: str | tiktoken.Encoding = None,
18            verbose=True
19    ):
20        from ..tokenizing import fit_to_token_size
21        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
22        if verbose and len(records) < len(self):
23            logging.info(
24                "For fitting %d records to %d tokens, %d records was removed",
25                len(self),
26                max_tokens,
27                removed
28            )
29        return SearchResults(list(records))
Inherited Members
builtins.list
list
clear
copy
append
insert
extend
pop
remove
index
count
reverse
sort
class SearchResult(microcore.utils.ExtendedString):
32class SearchResult(ExtendedString):
33    """
34    String containing the search result with additional information in attributes
35
36    Attributes:
37        id (str): document (text) identifier in embedding database
38        distance (float): The distance between the query and the search result
39        metadata (dict): A dictionary containing document metadata
40    """
41
42    id: str
43    distance: float
44    metadata: dict

String containing the search result with additional information in attributes

Attributes:
  • id (str): document (text) identifier in embedding database
  • distance (float): The distance between the query and the search result
  • metadata (dict): A dictionary containing document metadata
SearchResult(string: str, attrs: dict = None)
55    def __new__(cls, string: str, attrs: dict = None):
56        """
57        Allows string to have attributes.
58        """
59        obj = str.__new__(cls, string)
60        if attrs:
61            for k, v in attrs.items():
62                setattr(obj, k, v)
63        return obj

Allows string to have attributes.

id: str
distance: float
metadata: dict
Inherited Members
microcore.utils.ExtendedString
to_tokens
num_tokens
builtins.str
encode
replace
split
rsplit
join
capitalize
casefold
title
center
count
expandtabs
find
partition
index
ljust
lower
lstrip
rfind
rindex
rjust
rstrip
rpartition
splitlines
strip
swapcase
translate
upper
startswith
endswith
removeprefix
removesuffix
isascii
islower
isupper
istitle
isspace
isdecimal
isdigit
isnumeric
isalpha
isalnum
isidentifier
isprintable
zfill
format
format_map
maketrans
@dataclass
class AbstractEmbeddingDB(abc.ABC):
 47@dataclass
 48class AbstractEmbeddingDB(ABC):
 49    """
 50    Base class for embedding databases
 51    """
 52
 53    @abstractmethod
 54    def search(
 55            self,
 56            collection: str,
 57            query: str | list,
 58            n_results: int = 5,
 59            where: dict = None,
 60            **kwargs,
 61    ) -> list[str | SearchResult]:
 62        """
 63        Similarity search
 64
 65        Args:
 66            collection (str): collection name
 67            query (str | list): query string or list of query strings
 68            n_results (int): number of results to return
 69            where (dict): filter results by metadata
 70            **kwargs: additional arguments
 71        """
 72
 73    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
 74        """
 75        Alias for `search`
 76        """
 77        return self.search(*args, **kwargs)
 78
 79    def find_all(
 80            self,
 81            collection: str,
 82            query: str | list,
 83            where: dict = None,
 84            **kwargs,
 85    ) -> SearchResults | list[str | SearchResult]:
 86        return self.search(
 87            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
 88        )
 89
 90    @abstractmethod
 91    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
 92        """Return all documents in the collection"""
 93
 94    def save(self, collection: str, text: str, metadata: dict = None):
 95        """Save a single document in the collection"""
 96        self.save_many(collection, [(text, metadata)])
 97
 98    @abstractmethod
 99    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
100        """Save multiple documents in the collection"""
101
102    @abstractmethod
103    def clear(self, collection: str):
104        """Clear the collection"""
105
106    def find_one(self, collection: str, query: str | list) -> str | SearchResult | None:
107        """
108        Find most similar document in the collection
109
110        Returns:
111            Most similar document or None if collection is empty
112        """
113        return next(iter(self.search(collection, query, 1)), None)
114
115    @abstractmethod
116    def count(self, collection: str) -> int:
117        """
118        Count the number of documents in the collection
119
120        Returns:
121            Number of documents in the collection
122        """
123
124    @abstractmethod
125    def delete(self, collection: str, what: str | list[str] | dict):
126        """
127        Delete documents from the collection
128
129        Args:
130            collection (str): collection name
131            what (str | list[str] | dict): id, list ids or metadata query
132        """

Base class for embedding databases

@abstractmethod
def search( self, collection: str, query: str | list, n_results: int = 5, where: dict = None, **kwargs) -> list[str | SearchResult]:
53    @abstractmethod
54    def search(
55            self,
56            collection: str,
57            query: str | list,
58            n_results: int = 5,
59            where: dict = None,
60            **kwargs,
61    ) -> list[str | SearchResult]:
62        """
63        Similarity search
64
65        Args:
66            collection (str): collection name
67            query (str | list): query string or list of query strings
68            n_results (int): number of results to return
69            where (dict): filter results by metadata
70            **kwargs: additional arguments
71        """

Similarity search

Arguments:
  • collection (str): collection name
  • query (str | list): query string or list of query strings
  • n_results (int): number of results to return
  • where (dict): filter results by metadata
  • **kwargs: additional arguments
def find( self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
73    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
74        """
75        Alias for `search`
76        """
77        return self.search(*args, **kwargs)

Alias for `search`: forwards all arguments unchanged and returns its result.

def find_all( self, collection: str, query: str | list, where: dict = None, **kwargs) -> SearchResults | list[str | SearchResult]:
79    def find_all(
80            self,
81            collection: str,
82            query: str | list,
83            where: dict = None,
84            **kwargs,
85    ) -> SearchResults | list[str | SearchResult]:
86        return self.search(
87            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
88        )
@abstractmethod
def get_all( self, collection: str) -> SearchResults | list[str | SearchResult]:
90    @abstractmethod
91    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
92        """Return all documents in the collection"""

Return all documents in the collection

def save(self, collection: str, text: str, metadata: dict = None):
94    def save(self, collection: str, text: str, metadata: dict = None):
95        """Save a single document in the collection"""
96        self.save_many(collection, [(text, metadata)])

Save a single document in the collection

@abstractmethod
def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
 98    @abstractmethod
 99    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
100        """Save multiple documents in the collection"""

Save multiple documents in the collection

@abstractmethod
def clear(self, collection: str):
102    @abstractmethod
103    def clear(self, collection: str):
104        """Clear the collection"""

Clear the collection

def find_one( self, collection: str, query: str | list) -> str | SearchResult | None:
106    def find_one(self, collection: str, query: str | list) -> str | SearchResult | None:
107        """
108        Find most similar document in the collection
109
110        Returns:
111            Most similar document or None if collection is empty
112        """
113        return next(iter(self.search(collection, query, 1)), None)

Find most similar document in the collection

Returns:

The most similar document, or None if the search returns no results (for example, when the collection is empty)

@abstractmethod
def count(self, collection: str) -> int:
115    @abstractmethod
116    def count(self, collection: str) -> int:
117        """
118        Count the number of documents in the collection
119
120        Returns:
121            Number of documents in the collection
122        """

Count the number of documents in the collection

Returns:

Number of documents in the collection

@abstractmethod
def delete(self, collection: str, what: str | list[str] | dict):
124    @abstractmethod
125    def delete(self, collection: str, what: str | list[str] | dict):
126        """
127        Delete documents from the collection
128
129        Args:
130            collection (str): collection name
131            what (str | list[str] | dict): id, list ids or metadata query
132        """

Delete documents from the collection

Arguments:
  • collection (str): collection name
  • what (str | list[str] | dict): a document id, a list of ids, or a metadata query