MatteoScript committed
Commit 7376a17 · verified · 1 Parent(s): f7d650c

Update app.py

Files changed (1)
  1. app.py +36 -17
app.py CHANGED
@@ -15,8 +15,7 @@ from bs4 import BeautifulSoup
 import PyPDF2
 import pytesseract
 from PIL import Image
-
-
+from youtube_transcript_api import YouTubeTranscriptApi
 
 load_dotenv()
 URL_APP_SCRIPT = os.getenv('URL_APP_SCRIPT')
@@ -159,13 +158,13 @@ def sidebar():
     st.markdown("---")
     st.markdown("# Ricerca Online")
     st.session_state.cerca_online = st.toggle("Attivata", value=False)
-    #with st.popover("Siti Specifici", disabled=not st.session_state.cerca_online,use_container_width=True):
-    #    st.markdown("#### Inserisci Siti Web ")
-    #    for i in range(5):
-    #        st.session_state.urls[i] = st.text_input("", placeholder=f"URL Sito {i+1}")
-    st.session_state.selected_tbs = st.selectbox("Periodo:", list(st.session_state.tbs_options.keys()), disabled=not st.session_state.cerca_online)
+    with st.popover("Siti Specifici", disabled=not st.session_state.cerca_online,use_container_width=True):
+        st.markdown("#### Inserisci Siti Web ")
+        for i in range(5):
+            st.session_state.urls[i] = st.text_input(f"URL Sito {i+1}", placeholder='Sito Web...', help='è possibile specificare anche il link di un video Youtube, in tal caso verrà restituita la trascrizione del video')
+    st.session_state.selected_tbs = st.selectbox("Periodo:", list(st.session_state.tbs_options.keys()), disabled=(not st.session_state.cerca_online) or (st.session_state.urls[0]!=""))
     st.session_state.tbs_value = st.session_state.tbs_options[st.session_state.selected_tbs]
-    st.session_state.numero_siti = st.slider(label="Risultati", min_value = 1, max_value=20, value=3, disabled=not st.session_state.cerca_online)
+    st.session_state.numero_siti = st.slider(label="Risultati", min_value = 1, max_value=20, value=3, disabled=(not st.session_state.cerca_online) or (st.session_state.urls[0]!=""))
     #st.session_state.suddividi_ricerca = st.toggle("Attivata", value=False)
     st.markdown("---")
 
@@ -264,21 +263,41 @@ def gen_augmented_prompt(prompt, top_k) :
         links.append((reference, testo))
     return context, links
 
-def get_search_results(query, top_k):
-    results = []
-    for url in search(query, num=top_k, stop=top_k, tbs=st.session_state.tbs_value):
-        try:
+def get_search_results_int(url):
+    result = {'title': '', 'description': '', 'url': '', 'body': ''}
+    try:
+        if "www.youtube.com" in url:
+            video_id = url.split("=")[1]
+            title = 'Video Youtube'
+            description = ''
+            transcript = YouTubeTranscriptApi.get_transcript(video_id)
+            body_content = " ".join([segment["text"] for segment in transcript])
+            print(video_id)
+            print(body_content)
+            result = {'title': title, 'description': body_content, 'url': url, 'body': body_content}
+        else:
             response = requests.get(url)
             soup = BeautifulSoup(response.text, 'html.parser')
             title = soup.title.string if soup.title else "N/A"
             description = soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else "N/A"
             body_content = soup.find('body').get_text() if soup.find('body') else "N/A"
-            results.append({'title': title, 'description': description, 'url': url, 'body': body_content})
-        except Exception as e:
-            print(f"Error fetching data from {url}: {e}")
-            continue
-    return results
+            result = {'title': title, 'description': description, 'url': url, 'body': body_content}
+    except Exception as e:
+        print(f"Error fetching data from {url}: {e}")
+    return result
 
+def get_search_results(query, top_k):
+    results = []
+    if st.session_state.urls[0] != "":
+        for i in range(5):
+            url = st.session_state.urls[i]
+            if url != "":
+                results.append(get_search_results_int(url))
+    else:
+        for url in search(query, num=top_k, stop=top_k, tbs=st.session_state.tbs_value):
+            results.append(get_search_results_int(url))
+    return results
+
 def gen_online_prompt(prompt, top_k) :
     links = []
     context = ''
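A note on the new YouTube branch: get_search_results_int derives the video id with url.split("=")[1], which only covers plain https://www.youtube.com/watch?v=... links. A watch URL with extra query parameters (for example &t=30s) yields a wrong id, and youtu.be short links fall through to the page-scraping branch. Below is a minimal sketch of a more defensive parse that reuses the same YouTubeTranscriptApi.get_transcript call and segment-joining as the commit; the helper names extract_video_id and fetch_transcript_text are hypothetical and not part of the commit.

from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import YouTubeTranscriptApi


def extract_video_id(url):
    """Hypothetical helper: return the video id for watch?v=... and youtu.be/... links, else None."""
    parsed = urlparse(url)
    if parsed.hostname in ("www.youtube.com", "youtube.com", "m.youtube.com"):
        return parse_qs(parsed.query).get("v", [None])[0]
    if parsed.hostname == "youtu.be":
        return parsed.path.lstrip("/") or None
    return None


def fetch_transcript_text(url):
    """Join the transcript segments into one string, as the commit does for 'body'."""
    video_id = extract_video_id(url)
    if video_id is None:
        return ""
    transcript = YouTubeTranscriptApi.get_transcript(video_id)  # raises if no transcript is available
    return " ".join(segment["text"] for segment in transcript)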
 
 
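Since the new get_search_results only falls back to Google search when the first URL Sito field is empty, the Periodo selectbox and Risultati slider are disabled as soon as a specific site is entered. A quick, hypothetical smoke test for the get_search_results_int logic outside the Streamlit UI (for example with the function copied into a scratch script); the URLs are placeholders and the YouTube line will hit the except branch unless a real video id is supplied.

if __name__ == "__main__":
    for test_url in (
        "https://www.example.com",                        # scraped via requests + BeautifulSoup
        "https://www.youtube.com/watch?v=VIDEO_ID_HERE",  # replace with a real video id
    ):
        result = get_search_results_int(test_url)
        print(test_url, "->", result["title"], f"({len(result['body'])} chars)")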