Spaces:
Sleeping
Sleeping
MatteoScript
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -15,8 +15,7 @@ from bs4 import BeautifulSoup
|
|
15 |
import PyPDF2
|
16 |
import pytesseract
|
17 |
from PIL import Image
|
18 |
-
|
19 |
-
|
20 |
|
21 |
load_dotenv()
|
22 |
URL_APP_SCRIPT = os.getenv('URL_APP_SCRIPT')
|
@@ -159,13 +158,13 @@ def sidebar():
|
|
159 |
st.markdown("---")
|
160 |
st.markdown("# Ricerca Online")
|
161 |
st.session_state.cerca_online = st.toggle("Attivata", value=False)
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
st.session_state.selected_tbs = st.selectbox("Periodo:", list(st.session_state.tbs_options.keys()), disabled=not st.session_state.cerca_online)
|
167 |
st.session_state.tbs_value = st.session_state.tbs_options[st.session_state.selected_tbs]
|
168 |
-
st.session_state.numero_siti = st.slider(label="Risultati", min_value = 1, max_value=20, value=3, disabled=not st.session_state.cerca_online)
|
169 |
#st.session_state.suddividi_ricerca = st.toggle("Attivata", value=False)
|
170 |
st.markdown("---")
|
171 |
|
@@ -264,21 +263,41 @@ def gen_augmented_prompt(prompt, top_k) :
|
|
264 |
links.append((reference, testo))
|
265 |
return context, links
|
266 |
|
267 |
-
def
|
268 |
-
|
269 |
-
|
270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
response = requests.get(url)
|
272 |
soup = BeautifulSoup(response.text, 'html.parser')
|
273 |
title = soup.title.string if soup.title else "N/A"
|
274 |
description = soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else "N/A"
|
275 |
body_content = soup.find('body').get_text() if soup.find('body') else "N/A"
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
return results
|
281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
def gen_online_prompt(prompt, top_k) :
|
283 |
links = []
|
284 |
context = ''
|
|
|
15 |
import PyPDF2
|
16 |
import pytesseract
|
17 |
from PIL import Image
|
18 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
19 |
|
20 |
load_dotenv()
|
21 |
URL_APP_SCRIPT = os.getenv('URL_APP_SCRIPT')
|
|
|
158 |
st.markdown("---")
|
159 |
st.markdown("# Ricerca Online")
|
160 |
st.session_state.cerca_online = st.toggle("Attivata", value=False)
|
161 |
+
with st.popover("Siti Specifici", disabled=not st.session_state.cerca_online,use_container_width=True):
|
162 |
+
st.markdown("#### Inserisci Siti Web ")
|
163 |
+
for i in range(5):
|
164 |
+
st.session_state.urls[i] = st.text_input(f"URL Sito {i+1}", placeholder='Sito Web...', help='è possibile specificare anche il link di un video Youtube, in tal caso verrà restituita la trascrizione del video')
|
165 |
+
st.session_state.selected_tbs = st.selectbox("Periodo:", list(st.session_state.tbs_options.keys()), disabled=(not st.session_state.cerca_online) or (st.session_state.urls[0]!=""))
|
166 |
st.session_state.tbs_value = st.session_state.tbs_options[st.session_state.selected_tbs]
|
167 |
+
st.session_state.numero_siti = st.slider(label="Risultati", min_value = 1, max_value=20, value=3, disabled=(not st.session_state.cerca_online) or (st.session_state.urls[0]!=""))
|
168 |
#st.session_state.suddividi_ricerca = st.toggle("Attivata", value=False)
|
169 |
st.markdown("---")
|
170 |
|
|
|
263 |
links.append((reference, testo))
|
264 |
return context, links
|
265 |
|
266 |
+
def get_search_results_int(url):
    """Fetch title/description/body text for a single URL (best effort).

    For YouTube links the video transcript is retrieved via
    YouTubeTranscriptApi instead of scraping the page; for any other URL
    the page is downloaded with requests and parsed with BeautifulSoup.

    Parameters:
        url: web address to fetch; may be a YouTube watch URL, in which
            case the transcript text is returned.

    Returns:
        dict with keys 'title', 'description', 'url', 'body'. On any
        fetch/parse error the fields are empty strings — a failure must
        not abort the caller's loop over many URLs.
    """
    import urllib.parse  # local import: only needed for YouTube id parsing

    result = {'title': '', 'description': '', 'url': '', 'body': ''}
    try:
        if "www.youtube.com" in url:
            # Extract the video id from the 'v' query parameter instead of
            # url.split("=")[1], which breaks on URLs with extra params
            # such as .../watch?v=ID&t=30s.
            params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
            video_id = params.get('v', [url.split("=")[-1]])[0]
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            body_content = " ".join(segment["text"] for segment in transcript)
            # The transcript is intentionally stored in 'description' as
            # well as 'body' so downstream prompt builders see it either way.
            result = {'title': 'Video Youtube', 'description': body_content,
                      'url': url, 'body': body_content}
        else:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.string if soup.title else "N/A"
            # Look each element up once instead of repeating soup.find().
            meta = soup.find('meta', attrs={'name': 'description'})
            description = meta['content'] if meta else "N/A"
            body = soup.find('body')
            body_content = body.get_text() if body else "N/A"
            result = {'title': title, 'description': description,
                      'url': url, 'body': body_content}
    except Exception as e:
        # Best-effort contract: report and return the empty-field result.
        print(f"Error fetching data from {url}: {e}")
    return result
|
|
|
288 |
|
289 |
+
def get_search_results(query, top_k):
    """Collect page results either from user-supplied URLs or a web search.

    Mode selection mirrors the sidebar: the Google-search widgets are
    disabled whenever the first URL slot is filled, so urls[0] decides
    which branch runs.

    Parameters:
        query: search string; used only when no explicit URLs are given.
        top_k: number of search hits to fetch in search mode.

    Returns:
        list of result dicts as produced by get_search_results_int().
    """
    urls = st.session_state.urls
    if urls[0] != "":
        # Iterate the whole stored list instead of a hard-coded range(5)
        # so this keeps working if the sidebar ever offers more slots;
        # behaviour is identical for the current 5-slot sidebar.
        return [get_search_results_int(u) for u in urls if u != ""]
    return [get_search_results_int(u)
            for u in search(query, num=top_k, stop=top_k,
                            tbs=st.session_state.tbs_value)]
|
300 |
+
|
301 |
def gen_online_prompt(prompt, top_k) :
|
302 |
links = []
|
303 |
context = ''
|