File size: 1,529 Bytes
76cbdff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import streamlit as st
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores.utils import DistanceStrategy


def load_bge_embeddings():
    """Construct the BGE embedding function used by the vector store.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance configured for the
        ``BAAI/bge-small-en-v1.5`` model, with the device set to CPU,
        ``normalize_embeddings`` enabled, and a fixed query instruction.
    """
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
        query_instruction="Represent this question for searching relevant passages: ",
    )


def load_pinecone_vectorstore():
    """Create the ``PineconeVectorStore`` backed by the BGE embeddings.

    The Pinecone API key and index name are read from ``st.secrets``
    (keys ``pinecone_api_key`` and ``pinecone_index_name``).

    Returns:
        A ``PineconeVectorStore`` using ``"text"`` as the text key and
        ``DistanceStrategy.COSINE`` as the distance strategy.
    """
    # Load the embeddings first, then read secrets, so failures surface in
    # the same order as a straight top-to-bottom construction would.
    embeddings = load_bge_embeddings()
    api_key = st.secrets["pinecone_api_key"]
    index_name = st.secrets["pinecone_index_name"]
    return PineconeVectorStore(
        embedding=embeddings,
        text_key="text",
        distance_strategy=DistanceStrategy.COSINE,
        pinecone_api_key=api_key,
        index_name=index_name,
    )


def get_vectorstore_filter(ret_config: dict) -> dict:
    """Translate retrieval-config filter settings into a metadata filter dict.

    Args:
        ret_config: Mapping that must contain the keys ``filter_legis_id``,
            ``filter_bioguide_id``, ``filter_congress_nums`` and
            ``filter_sponsor_parties``.

    Returns:
        A dict with exact-match entries for ``legis_id`` and
        ``sponsor_bioguide_id`` (each included only when its config value is
        not the empty string), plus ``{"$in": ...}`` entries for
        ``congress_num`` and ``sponsor_party`` (always included).

    Raises:
        KeyError: If any of the expected keys is missing from ``ret_config``.
    """
    # (metadata field, config key) pairs matched by equality; skipped when
    # the configured value is the empty string.
    exact_match_fields = (
        ("legis_id", "filter_legis_id"),
        ("sponsor_bioguide_id", "filter_bioguide_id"),
    )
    # (metadata field, config key) pairs matched by membership; always set.
    membership_fields = (
        ("congress_num", "filter_congress_nums"),
        ("sponsor_party", "filter_sponsor_parties"),
    )

    conditions = {}
    for field, config_key in exact_match_fields:
        wanted = ret_config[config_key]
        if wanted != "":
            conditions[field] = wanted
    for field, config_key in membership_fields:
        conditions[field] = {"$in": ret_config[config_key]}
    return conditions