Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

deeplake fixes #2

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 68 additions & 5 deletions libs/langchain/langchain/vectorstores/deeplake.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
try:
import deeplake
from deeplake.core.fast_forwarding import version_compare
from deeplake.core.vectorstore import DeepLakeVectorStore
from deeplake import VectorStore as DeepLakeVectorStore

_DEEPLAKE_INSTALLED = True
except ImportError:
Expand Down Expand Up @@ -63,6 +63,7 @@ def __init__(
verbose: bool = True,
exec_option: Optional[str] = None,
runtime: Optional[Dict] = None,
index_params: Optional[Dict[str, Union[int, str]]] = None,
**kwargs: Any,
) -> None:
"""Creates an empty DeepLakeVectorStore or loads an existing one.
Expand Down Expand Up @@ -119,6 +120,23 @@ def __init__(
Deep Lake's Managed Tensor Database. Not applicable when loading an
existing Vector Store. To create a Vector Store in the Managed Tensor
Database, set `runtime = {"tensor_db": True}`.
index_params (Optional[Dict[str, Union[int, str]]], optional): Dictionary
containing information about vector index that will be created. Defaults
to None, which will utilize ``DEFAULT_VECTORSTORE_INDEX_PARAMS`` from
``deeplake.constants``. The specified key-values override the default
ones.
- threshold: The threshold for the dataset size above which an index
will be created for the embedding tensor. When the threshold value
is set to -1, index creation is turned off. Defaults to -1, which
turns off the index.
- distance_metric: This key specifies the method of calculating the
distance between vectors when creating the vector database (VDB)
index. It can either be a string that corresponds to a member of
the DistanceType enumeration, or the string value itself.
- If no value is provided, it defaults to "L2".
- "L2" corresponds to DistanceType.L2_NORM.
- "COS" corresponds to DistanceType.COSINE_SIMILARITY.
- additional_params: Additional parameters for fine-tuning the index.
**kwargs: Other optional keyword arguments.

Raises:
Expand Down Expand Up @@ -161,6 +179,7 @@ def __init__(
exec_option=exec_option,
verbose=verbose,
runtime=runtime,
index_params=index_params,
**kwargs,
)

Expand Down Expand Up @@ -295,12 +314,13 @@ def _search(
embedding: Optional[Union[List[float], np.ndarray]] = None,
embedding_function: Optional[Callable] = None,
k: int = 4,
distance_metric: str = "L2",
distance_metric: Optional[str] = None,
use_maximal_marginal_relevance: bool = False,
fetch_k: Optional[int] = 20,
filter: Optional[Union[Dict, Callable]] = None,
return_score: bool = False,
exec_option: Optional[str] = None,
deep_memory: bool = False,
**kwargs: Any,
) -> Any[List[Document], List[Tuple[Document, float]]]:
"""
Expand All @@ -312,9 +332,9 @@ def _search(
embedding_function (Callable, optional): Function to convert `query`
into embedding.
k (int): Number of Documents to return.
distance_metric (str): `L2` for Euclidean, `L1` for Nuclear, `max`
for L-infinity distance, `cos` for cosine similarity, 'dot' for dot
product.
distance_metric (Optional[str], optional): `L2` for Euclidean, `L1` for
Manhattan, `max` for L-infinity distance, `cos` for cosine similarity,
'dot' for dot product.
filter (Union[Dict, Callable], optional): Additional filter prior
to the embedding search.
- ``Dict`` - Key-value search on tensors of htype json, on an
Expand All @@ -334,6 +354,13 @@ def _search(
- ``tensor_db`` - Hosted Managed Tensor Database for storage
and query execution. Only for data in Deep Lake Managed Database.
Use runtime = {"tensor_db": True} during dataset creation.
deep_memory (bool): Whether to use the Deep Memory model for improving
search results. Defaults to False if deep_memory is not specified in
the Vector Store initialization. If True, the distance metric is set
to "deepmemory_distance", which represents the metric with which the
model was trained. The search is performed using the Deep Memory model.
If False, the distance metric is set to "COS" or whichever distance
metric the user specifies.
**kwargs: Additional keyword arguments.

Returns:
Expand Down Expand Up @@ -387,6 +414,7 @@ def _search(
filter=filter,
exec_option=exec_option,
return_tensors=["embedding", "metadata", "text"],
deep_memory=deep_memory,
)

scores = result["score"]
Expand Down Expand Up @@ -467,6 +495,13 @@ def similarity_search(
- 'tensor_db': Managed Tensor Database for storage and query.
Only for data in Deep Lake Managed Database.
Use `runtime = {"tensor_db": True}` during dataset creation.
deep_memory (bool): Whether to use the Deep Memory model for improving
search results. Defaults to False if deep_memory is not specified
in the Vector Store initialization. If True, the distance metric
is set to "deepmemory_distance", which represents the metric with
which the model was trained. The search is performed using the Deep
Memory model. If False, the distance metric is set to "COS" or
whichever distance metric the user specifies.

Returns:
List[Document]: List of Documents most similar to the query vector.
Expand Down Expand Up @@ -530,6 +565,13 @@ def similarity_search_by_vector(
distance_metric (str): `L2` for Euclidean, `L1` for Manhattan,
`max` for L-infinity distance, `cos` for cosine similarity,
'dot' for dot product. Defaults to `L2`.
deep_memory (bool): Whether to use the Deep Memory model for improving
search results. Defaults to False if deep_memory is not specified
in the Vector Store initialization. If True, the distance metric
is set to "deepmemory_distance", which represents the metric with
which the model was trained. The search is performed using the Deep
Memory model. If False, the distance metric is set to "COS" or
whichever distance metric the user specifies.

Returns:
List[Document]: List of Documents most similar to the query vector.
Expand Down Expand Up @@ -586,6 +628,13 @@ def similarity_search_with_score(
data stored in the Deep Lake Managed Database. To store datasets
in this database, specify `runtime = {"tensor_db": True}`
during dataset creation.
deep_memory (bool): Whether to use the Deep Memory model for improving
search results. Defaults to False if deep_memory is not specified
in the Vector Store initialization. If True, the distance metric
is set to "deepmemory_distance", which represents the metric with
which the model was trained. The search is performed using the Deep
Memory model. If False, the distance metric is set to "COS" or
whichever distance metric the user specifies.

Returns:
List[Tuple[Document, float]]: List of documents most similar to the query
Expand Down Expand Up @@ -641,6 +690,13 @@ def max_marginal_relevance_search_by_vector(
data stored in the Deep Lake Managed Database. To store datasets
in this database, specify `runtime = {"tensor_db": True}`
during dataset creation.
deep_memory (bool): Whether to use the Deep Memory model for improving
search results. Defaults to False if deep_memory is not specified
in the Vector Store initialization. If True, the distance metric
is set to "deepmemory_distance", which represents the metric with
which the model was trained. The search is performed using the Deep
Memory model. If False, the distance metric is set to "COS" or
whichever distance metric the user specifies.
**kwargs: Additional keyword arguments.

Returns:
Expand Down Expand Up @@ -701,6 +757,13 @@ def max_marginal_relevance_search(
for data stored in the Deep Lake Managed Database. To store
datasets in this database, specify
`runtime = {"tensor_db": True}` during dataset creation.
deep_memory (bool): Whether to use the Deep Memory model for improving
search results. Defaults to False if deep_memory is not specified
in the Vector Store initialization. If True, the distance metric
is set to "deepmemory_distance", which represents the metric with
which the model was trained. The search is performed using the Deep
Memory model. If False, the distance metric is set to "COS" or
whichever distance metric the user specifies.
**kwargs: Additional keyword arguments

Returns:
Expand Down
4 changes: 1 addition & 3 deletions libs/langchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ arxiv = {version = "^1.4", optional = true}
pypdf = {version = "^3.4.0", optional = true}
networkx = {version=">=2.6.3, <4", optional = true}
aleph-alpha-client = {version="^2.15.0", optional = true}
deeplake = {version = "^3.6.8", optional = true}
libdeeplake = {version = "^0.0.60", optional = true}
deeplake = {version = "^3.8.3", optional = true}
pgvector = {version = "^0.1.6", optional = true}
psycopg2-binary = {version = "^2.9.5", optional = true}
pyowm = {version = "^3.3.0", optional = true}
Expand Down Expand Up @@ -265,7 +264,6 @@ all = [
"nomic",
"aleph-alpha-client",
"deeplake",
"libdeeplake",
"pgvector",
"psycopg2-binary",
"pyowm",
Expand Down
Loading
Loading