import streamlit as st
import wikipedia
import wikipediaapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Initialize Wikipedia API
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent="LSI1/1.0 (sanaa.2000.n@gmail.com)"
)


# Fetch related Wikipedia articles for a query. Cached so that Streamlit
# reruns (e.g. clicking "Load More") do not re-download the same articles.
@st.cache_data(show_spinner=False)
def fetch_related_articles(query, max_articles=20):
    search_results = wikipedia.search(query, results=max_articles)
    articles = {}
    for title in search_results:
        page = wiki_wiki.page(title)
        if page.exists():
            articles[title] = page.text
    return articles


# Rank articles by cosine similarity between the query and each article
# in the LSI (truncated-SVD) space.
def rank_articles(query, vectorizer, svd, lsi_matrix, titles):
    query_tfidf = vectorizer.transform([query])
    query_lsi = svd.transform(query_tfidf)
    query_lsi = normalize(query_lsi)
    similarities = cosine_similarity(query_lsi, lsi_matrix).flatten()
    ranked_indices = similarities.argsort()[::-1]  # Sort by similarity (descending)
    return [(titles[idx], similarities[idx]) for idx in ranked_indices]


# Streamlit UI
st.title("Wikipedia Search with SVD")
st.write("Enter a search query to fetch and rank Wikipedia articles.")

# Input for query
search_query = st.text_input("Search Wikipedia:")

# Detect query change and clear session state if needed
if "previous_query" not in st.session_state:
    st.session_state.previous_query = None

if search_query and search_query != st.session_state.previous_query:
    # Reset session state variables
    st.session_state.previous_query = search_query
    st.session_state.ranked_results = None
    st.session_state.end_index = 10

if search_query:
    # Fetch articles dynamically
    with st.spinner("Fetching articles..."):
        articles = fetch_related_articles(search_query)

    if not articles:
        st.warning("No articles found! Try a different query.")
    else:
        # Prepare data for TF-IDF
        titles = list(articles.keys())
        contents = list(articles.values())

        # TF-IDF vectorization
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(contents)

        # SVD dimensionality reduction (LSI). TruncatedSVD cannot extract more
        # components than the corpus has documents, so cap n_components by the
        # number of fetched articles.
        n_components = min(100, max(1, len(contents) - 1))
        svd = TruncatedSVD(n_components=n_components)
        lsi_matrix = svd.fit_transform(tfidf_matrix)
        lsi_matrix = normalize(lsi_matrix)  # Normalize rows for cosine similarity

        # Rank once per query and keep the result in session state for pagination
        if st.session_state.ranked_results is None:
            st.session_state.ranked_results = rank_articles(
                search_query, vectorizer, svd, lsi_matrix, titles
            )

        # Display ranked results up to the current pagination limit
        end_index = st.session_state.end_index
        ranked_results = st.session_state.ranked_results[:end_index]

        st.subheader("Search Results:")
        for title, similarity in ranked_results:
            url = "https://en.wikipedia.org/wiki/" + title.replace(" ", "_")
            st.markdown(f"### [{title}]({url})")
            st.write(f"**Similarity Score:** {similarity:.2f}")
            st.write("---")

        # Pagination controls
        if end_index < len(st.session_state.ranked_results):
            if st.button("Load More"):
                st.session_state.end_index += 10
                st.rerun()  # Re-render immediately with the larger page size
        else:
            st.info("No more articles to load.")
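

# ---------------------------------------------------------------------------
# Usage (a minimal sketch; the file name app.py is an assumption, install the
# listed packages in whatever environment you prefer):
#
#   pip install streamlit wikipedia wikipedia-api scikit-learn
#   streamlit run app.py
#
# Streamlit serves the app in the browser; enter a query and use "Load More"
# to page through the ranked results ten at a time.
# ---------------------------------------------------------------------------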