"""Wrapper around Pinecone vector database.""" | |
from __future__ import annotations | |
import uuid | |
from typing import Any, Callable, Iterable, List, Optional, Tuple | |
from langchain.docstore.document import Document | |
from langchain.embeddings.base import Embeddings | |
from langchain.vectorstores.base import VectorStore | |
class Pinecone(VectorStore): | |
"""Wrapper around Pinecone vector database. | |
To use, you should have the ``pinecone-client`` python package installed. | |
Example: | |
.. code-block:: python | |
from langchain.vectorstores import Pinecone | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
import pinecone | |
pinecone.init(api_key="***", environment="us-west1-gcp") | |
index = pinecone.Index("langchain-demo") | |
embeddings = OpenAIEmbeddings() | |
vectorstore = Pinecone(index, embeddings.embed_query, "text") | |
""" | |

    def __init__(
        self,
        index: Any,
        embedding_function: Callable,
        text_key: str,
        namespace: Optional[str] = None,
    ):
        """Initialize with Pinecone client."""
        try:
            import pinecone
        except ImportError:
            raise ValueError(
                "Could not import pinecone python package. "
                "Please install it with `pip install pinecone-client`."
            )
        if not isinstance(index, pinecone.index.Index):
            raise ValueError(
                f"client should be an instance of pinecone.index.Index, "
                f"got {type(index)}"
            )
        self._index = index
        self._embedding_function = embedding_function
        self._text_key = text_key
        self._namespace = namespace

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        namespace: Optional[str] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            namespace: Optional pinecone namespace to add the texts to.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        if namespace is None:
            namespace = self._namespace
        # Materialize the iterable so it can be traversed more than once below.
        texts = list(texts)
        # Embed each text and build (id, vector, metadata) tuples to upsert
        docs = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        for i, text in enumerate(texts):
            embedding = self._embedding_function(text)
            metadata = metadatas[i] if metadatas else {}
            metadata[self._text_key] = text
            docs.append((ids[i], embedding, metadata))
        # upsert to Pinecone in batches of `batch_size`
        self._index.upsert(vectors=docs, namespace=namespace, batch_size=batch_size)
        return ids
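
    # A hedged usage sketch (not part of the original module) showing how
    # `add_texts` might be called on the `vectorstore` built in the class
    # docstring example; the texts, metadata, and namespace below are
    # illustrative assumptions, not values from the source.
    #
    #     new_ids = vectorstore.add_texts(
    #         texts=["Pinecone stores dense vectors.", "LangChain wraps vector stores."],
    #         metadatas=[{"source": "doc-1"}, {"source": "doc-2"}],
    #         namespace="demo",  # assumed namespace
    #     )
    #     # One generated UUID per text is returned when `ids` is not supplied.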

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 5,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """Return pinecone documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 5.
            filter: Dictionary of argument(s) to filter on metadata.
            namespace: Namespace to search in. Default will search in '' namespace.

        Returns:
            List of Documents most similar to the query and score for each.
        """
        if namespace is None:
            namespace = self._namespace
        query_obj = self._embedding_function(query)
        docs = []
        results = self._index.query(
            [query_obj],
            top_k=k,
            include_metadata=True,
            namespace=namespace,
            filter=filter,
        )
        for res in results["matches"]:
            metadata = res["metadata"]
            text = metadata.pop(self._text_key)
            docs.append((Document(page_content=text, metadata=metadata), res["score"]))
        return docs
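
    # A minimal sketch (assumed query string, not from the source) of reading
    # back scored results; `vectorstore` is the wrapper from the class
    # docstring example.
    #
    #     for doc, score in vectorstore.similarity_search_with_score(
    #         "What does Pinecone do?", k=3
    #     ):
    #         print(score, doc.page_content)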

    def similarity_search(
        self,
        query: str,
        k: int = 5,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return pinecone documents most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 5.
            filter: Dictionary of argument(s) to filter on metadata.
            namespace: Namespace to search in. Default will search in '' namespace.

        Returns:
            List of Documents most similar to the query.
        """
        if namespace is None:
            namespace = self._namespace
        query_obj = self._embedding_function(query)
        docs = []
        results = self._index.query(
            [query_obj],
            top_k=k,
            include_metadata=True,
            namespace=namespace,
            filter=filter,
        )
        for res in results["matches"]:
            metadata = res["metadata"]
            text = metadata.pop(self._text_key)
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
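
    # Sketch of a filtered search, assuming the stored metadata carries a
    # hypothetical "source" field; Pinecone metadata filters use MongoDB-style
    # operators such as `$eq`.
    #
    #     docs = vectorstore.similarity_search(
    #         "What does Pinecone do?",
    #         k=4,
    #         filter={"source": {"$eq": "doc-1"}},
    #     )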

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        batch_size: int = 32,
        text_key: str = "text",
        index_name: Optional[str] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> Pinecone:
        """Construct Pinecone wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Adds the documents to a provided Pinecone index.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain import Pinecone
                from langchain.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                pinecone = Pinecone.from_texts(
                    texts,
                    embeddings,
                    index_name="langchain-demo"
                )
        """
        try:
            import pinecone
        except ImportError:
            raise ValueError(
                "Could not import pinecone python package. "
                "Please install it with `pip install pinecone-client`."
            )
        _index_name = index_name or str(uuid.uuid4())
        indexes = pinecone.list_indexes()  # checks if provided index exists
        if _index_name in indexes:
            index = pinecone.Index(_index_name)
        else:
            index = None
        for i in range(0, len(texts), batch_size):
            # set end position of batch
            i_end = min(i + batch_size, len(texts))
            # get batch of texts and ids
            lines_batch = texts[i:i_end]
            # create ids if not provided
            if ids:
                ids_batch = ids[i:i_end]
            else:
                ids_batch = [str(uuid.uuid4()) for n in range(i, i_end)]
            # create embeddings
            embeds = embedding.embed_documents(lines_batch)
            # prep metadata and upsert batch
            if metadatas:
                metadata = metadatas[i:i_end]
            else:
                metadata = [{} for _ in range(i, i_end)]
            for j, line in enumerate(lines_batch):
                metadata[j][text_key] = line
            to_upsert = zip(ids_batch, embeds, metadata)
            # Create index if it does not exist
            if index is None:
                pinecone.create_index(_index_name, dimension=len(embeds[0]))
                index = pinecone.Index(_index_name)
            # upsert to Pinecone
            index.upsert(vectors=list(to_upsert), namespace=namespace)
        return cls(index, embedding.embed_query, text_key, namespace)

    @classmethod
    def from_existing_index(
        cls,
        index_name: str,
        embedding: Embeddings,
        text_key: str = "text",
        namespace: Optional[str] = None,
    ) -> Pinecone:
        """Load pinecone vectorstore from index name."""
        try:
            import pinecone
        except ImportError:
            raise ValueError(
                "Could not import pinecone python package. "
                "Please install it with `pip install pinecone-client`."
            )
        return cls(
            pinecone.Index(index_name), embedding.embed_query, text_key, namespace
        )
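

# A minimal end-to-end sketch, kept behind a __main__ guard so that importing
# this module stays side-effect free. The API key, environment, and index name
# are placeholder assumptions; substitute real values (and set OPENAI_API_KEY)
# before running.
if __name__ == "__main__":
    import pinecone

    from langchain.embeddings.openai import OpenAIEmbeddings

    pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp")
    embeddings = OpenAIEmbeddings()
    # Reuse an index whose vectors were written with text_key="text".
    store = Pinecone.from_existing_index(
        index_name="langchain-demo",  # assumed index name
        embedding=embeddings,
        text_key="text",
    )
    for doc in store.similarity_search("What does Pinecone do?", k=2):
        print(doc.page_content)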