File size: 1,240 Bytes
8404ae6
37958d9
 
8404ae6
37958d9
 
 
 
 
 
 
8404ae6
0b8fc5d
37958d9
a42b85f
e44114f
37958d9
8404ae6
37958d9
c920449
 
8404ae6
37958d9
c920449
8404ae6
37958d9
 
8404ae6
04575d6
c920449
e6c3185
04575d6
7ecf31e
04575d6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import html

import streamlit as st
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

def search(text, model, ds, n):
    """Find the ``n`` indexed entries closest to ``text``.

    Args:
        text: Query string; it is passed to ``model.encode``.
        model: Object exposing ``encode(text)``, whose result is used as
            the query vector.
        ds: Object exposing ``get_nearest_examples(index_name, query, k=...)``
            and returning ``(scores, examples)``, where ``examples`` is
            indexable by the keys ``"title"``, ``"link"`` and ``"content"``.
        n: Number of neighbours to request.

    Returns:
        A list of ``(title, snippet, link, score)`` tuples, where
        ``snippet`` is the first 150 characters of the entry's content.
    """
    query_vector = model.encode(text)
    scores, hits = ds.get_nearest_examples('embedding', query_vector, k=n)

    titles = hits["title"]
    links = hits["link"]
    # Only a short preview of each body is returned to the caller.
    snippets = [body[:150] for body in hits["content"]]

    return list(zip(titles, snippets, links, scores))

def _dataset_cache():
    """Return the Streamlit caching decorator to use for the dataset.

    ``st.cache`` is deprecated since Streamlit 1.18 in favour of
    ``st.cache_data`` / ``st.cache_resource``.  The dataset carries a FAISS
    index, so it is treated as a shared resource and ``st.cache_resource``
    is preferred.  Older Streamlit releases that lack it keep using
    ``st.cache()``.
    """
    if hasattr(st, "cache_resource"):
        return st.cache_resource
    return st.cache()


@_dataset_cache()
def get_dataset():
    """Load the embedded news dataset and attach a FAISS index to it.

    The result is cached by Streamlit, so the download and index build are
    not repeated on every script rerun.

    Returns:
        The ``"train"`` split of ``justinian336/salvadoran-news-embedded``
        with a FAISS index built on its ``"embedding"`` column.
    """
    ds = load_dataset("justinian336/salvadoran-news-embedded")["train"]
    # The index is what makes ``get_nearest_examples('embedding', ...)`` work.
    ds.add_faiss_index(column="embedding")
    return ds

def get_model():
    """Ensure ``st.session_state["model"]`` holds the sentence encoder.

    The ``justinian336/chupeto`` SentenceTransformer is instantiated only
    when no ``"model"`` entry exists yet in the session state.  Nothing is
    returned; callers read the model from ``st.session_state["model"]``.
    """
    # An entry is already present: keep it and skip the expensive load.
    if "model" in st.session_state:
        return
    st.session_state["model"] = SentenceTransformer("justinian336/chupeto")

# Prepare the indexed dataset and the encoder before drawing the page.
ds = get_dataset()
get_model()

st.markdown("# Buscador de Noticias Salvadoreñas")
search_text = st.text_input(label="Búsqueda")

# An empty text box is falsy, so nothing is searched until the user types.
if search_text:
    search_results = search(search_text, st.session_state["model"], ds, 10)
    for title, content, url, _ in search_results:
        # The fields come from the dataset and are rendered as raw HTML
        # (unsafe_allow_html=True), so escape them: stray "<", "&" or quotes
        # would otherwise break the markup or inject tags/attributes.
        # str() keeps non-string values (e.g. None) rendering as before.
        safe_title = html.escape(str(title))
        safe_url = html.escape(str(url), quote=True)
        safe_content = html.escape(str(content))
        st.markdown(f"""<div><a href="{safe_url}">{safe_title}</a></div>""", unsafe_allow_html=True)
        st.markdown(f"""<div>{safe_content}...</div>""", unsafe_allow_html=True)
        st.markdown("---")