# Utilities for loading web pages, uploading them to a Qdrant vector store,
# and running semantic search over the stored vectors.
from qdrant_client import models
from langchain_community.document_loaders.url import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
def urlload(urls):
    """Load and chunk the contents of up to five comma-separated URLs.

    Args:
        urls: Comma-separated string of URLs to fetch.

    Returns:
        On success, a list of dicts with keys ``"text"`` (one chunk of page
        content) and ``"url"`` (the source URL of that chunk).
        On failure, or when more than five URLs are supplied, an explanatory
        message string (callers depend on this string-return error style).
    """
    # Strip surrounding whitespace and drop empty fragments so that
    # "a, b" and trailing commas parse the same as "a,b".
    links = [link.strip() for link in urls.split(",") if link.strip()]
    if len(links) > 5:
        return "Sorry, for space reasons we cannot grant the upload of more than 5 URLs :("
    try:
        # NOTE(fix): UnstructuredURLLoader's partitioning keyword is ``mode``
        # ("single"/"elements"); the original passed ``method="elements"``,
        # which is not the mode parameter and never enabled elements mode.
        loader = UnstructuredURLLoader(urls=links, mode="elements", strategy="fast")
        docs = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        pages = text_splitter.split_documents(docs)
        # Pair every chunk with the URL it came from.
        return [
            {"text": page.page_content, "url": page.metadata["source"]}
            for page in pages
        ]
    except Exception as e:
        return f"An error occurred while parsing the URLs: {e}"
class NeuralSearcher:
    """Semantic search over a single Qdrant collection.

    Bundles a Qdrant client with a sentence-embedding model so free-text
    queries can be matched against the vectors stored in the collection.
    """

    def __init__(self, collection_name, client, model):
        self.collection_name = collection_name
        # Encoder used to turn query text into an embedding vector.
        self.model = model
        # Pre-configured Qdrant client instance.
        self.qdrant_client = client

    def search(self, text: str, num_return: int):
        """Return payloads of the ``num_return`` closest matches to ``text``."""
        # Embed the query into the collection's vector space.
        query_vector = self.model.encode(text).tolist()
        hits = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            query_filter=None,  # no filtering — search the whole collection
            limit=num_return,
        )
        # Each hit carries id/score/payload; only the payloads matter here.
        return [hit.payload for hit in hits]
def upload_to_qdrant_collection(client, collection_name, encoder, documents):
    """Encode each document's ``"text"`` field and upload it to Qdrant.

    Every document becomes one point whose id is its position in
    ``documents`` and whose payload is the document dict itself.
    """
    points = []
    for point_id, document in enumerate(documents):
        # The embedding is computed from the document's text field only;
        # the full dict travels along as the point's payload.
        embedding = encoder.encode(document["text"]).tolist()
        points.append(
            models.PointStruct(id=point_id, vector=embedding, payload=document)
        )
    client.upload_points(collection_name=collection_name, points=points)
def upload_to_qdrant_subcollection(client, collection_name, encoder, documents):
    """Recreate ``collection_name`` from scratch and fill it with ``documents``.

    Unlike :func:`upload_to_qdrant_collection`, this first drops any existing
    collection of the same name and creates a fresh one sized for the
    encoder's embeddings, so afterwards the collection contains only
    ``documents``.
    """
    # Drop-and-recreate guarantees the collection holds nothing but the
    # points uploaded below.
    client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # Vector size is dictated by the embedding model.
            size=encoder.get_sentence_embedding_dimension(),
            distance=models.Distance.COSINE,
        ),
    )
    # Delegate to the plain uploader instead of duplicating its
    # encode-and-upload loop (keeps the two paths consistent).
    upload_to_qdrant_collection(client, collection_name, encoder, documents)