from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from langdetect import detect
import numpy as np
import gradio as gr
import re
import torch

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
vectorizer = TfidfVectorizer(stop_words='english')


def text_fetch(url, driver):
    """Load a page with Selenium and return its paragraph text, or None on failure."""
    try:
        print(url)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Strip navigation chrome and cookie banners before extracting text.
        cookie_selectors = [
            'div.cookie-banner', 'div.cookie-popup', 'div.cookie-notice',
            'footer', 'aside', 'div#cookie-consent', 'div#cookie-banner',
            'nav', 'header',
        ]
        for selector in cookie_selectors:
            for element in soup.select(selector):
                element.decompose()

        paragraphs = soup.find_all('p')
        cleaned_text = ''
        # Terms that flag a paragraph as consent/boilerplate text rather than article body.
        dispose = {'cookies', 'cookie', 'news', 'privacy', '\n', 'verifying you are human'}
        for p in paragraphs:
            try:
                text = p.get_text()
                lowered = text.lower()
                # Substring match so multi-word terms like "verifying you are human" are caught too.
                if any(term in lowered for term in dispose):
                    continue
                cleaned_text += ' ' + text
            except Exception:
                continue
        print("ok")
        return cleaned_text
    except TimeoutException:
        return None
    except WebDriverException:
        return None
    except Exception:
        return None


def google_search(query, driver):
    """Search Google News for the query and return result links from the first two pages."""
    try:
        driver.get("https://www.google.com/")
        try:
            print("google")
            search = driver.find_element(By.NAME, "q")
            search.send_keys(query)
            search.send_keys(Keys.RETURN)
        except NoSuchElementException as e:
            print(1, e)
            return []
        except TimeoutException:
            print(2)
            return []

        # Switch to the "News" tab.
        try:
            news_link = driver.find_element(By.LINK_TEXT, "News")
            news_link.click()
        except NoSuchElementException as e:
            print(3, e)
            return []

        try:
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            anchors = soup.find_all('a', class_='WlydOe')

            # Click on "Next" for more results.
            next_link = driver.find_element(By.LINK_TEXT, "Next")
            next_link.click()

            # Scrape the next page for more news links.
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            anchors.extend(soup.find_all('a', class_='WlydOe'))
        except (NoSuchElementException, TimeoutException) as e:
            print(4, e)
            return []

        links = [anchor.get('href') for anchor in anchors if anchor.get('href')]
        print(len(links))
        return links
    except Exception as e:
        print(5, e)
        return []


def clean_text(text):
    """Keep English text only, stripped of characters outside letters and basic punctuation."""
    try:
        if detect(text) == 'en':
            return re.sub(r'[^a-zA-Z\s.,?!]', '', text)
        return ''
    except Exception:
        return ''


def text_generation(query):
    """Search, fetch each result page, and return a mapping of cleaned text -> source URL."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    # driver.set_page_load_timeout(10)
    print("driver started")

    links = google_search(query, driver)
    pair = {}
    for i, url in enumerate(links, start=1):
        print(i, end=' ')
        text = text_fetch(url, driver)
        if text:
            url = url.rstrip('/')
            text = clean_text(text)
            # clean_text returns '' for non-English pages; skip them so empty
            # keys do not overwrite each other in the dict.
            if text:
                pair[text] = url
    driver.quit()
    print("driver quit")
    print(len(pair), "pairs")
    return pair


def clustering(pairs, e=0.8):
    """Group similar articles with DBSCAN over cosine distances of TF-IDF vectors.

    If fewer than 3 clusters form, retry with a tighter eps and keep the better
    of the two results.
    """
    texts = list(pairs)
    try:
        X = vectorizer.fit_transform(texts)
        cosine_sim_matrix = cosine_similarity(X)
        cosine_dist_matrix = np.clip(1 - cosine_sim_matrix, 0.0, 1.0)
        db = DBSCAN(metric="precomputed", eps=e, min_samples=2)
        labels = db.fit_predict(cosine_dist_matrix)
    except Exception:
        return {}

    cluster = {}
    for i, label in enumerate(labels):
        if label < 0:  # DBSCAN marks noise points as -1
            continue
        if label not in cluster:
            cluster[label] = [[], []]
        cluster[label][0].append(texts[i])
        cluster[label][1].append(pairs[texts[i]])

    best_l = len(cluster)
    best_cluster = cluster
    # Tighten eps and retry if too few clusters formed; stop before eps reaches 0,
    # since DBSCAN requires a positive eps.
    if len(cluster) < 3 and e - 0.1 > 0:
        cluster = clustering(pairs, e - 0.1)

    for label in cluster:
        cluster[label][0].sort(key=len)  # shortest to longest text

    if best_l > len(cluster):
        return best_cluster
    return cluster


def summarization(text):
    """Summarize the first ~700 words of a text; skip texts of 250 words or fewer."""
    words = text.split()
    if len(words) <= 250:
        return None
    text = ' '.join(words[:700])
    summary = summarizer(text, max_length=250, min_length=100, do_sample=False)
    return summary[0]['summary_text']


def output(cluster):
    """Summarize the longest article in each cluster and attach its source URLs."""
    out = {}
    label = 0
    # Always advance the label, even when summarization returns None, so a
    # short cluster cannot loop forever.
    while label in cluster:
        summary = summarization(cluster[label][0][-1])  # longest text in the cluster
        if summary:
            out[summary] = cluster[label][1]
        label += 1
    return out


def query(company, domain=''):
    search_query = company + ' ' + domain
    pairs = text_generation(search_query)
    print("clustering start")
    cluster = clustering(pairs)
    print(len(cluster), "clusters")
    print("summarization")
    result = output(cluster)
    print("Done")
    return result


def gradio_fun(name, domain=''):
    result = query(name, domain)
    out_str = f'"{name.capitalize()}" Report on "{domain.capitalize()}" domain.\n\n'
    for i, text in enumerate(result, start=1):
        out_str += f"{i}. {text}\n\nSupporting URLs\n"
        for link in result[text]:
            out_str += ">>" + link + "\n"
        out_str += '\n\n'
    return out_str


with gr.Blocks(fill_height=True, title="Company Insight") as demo:
    gr.Markdown("# Company Insight\nFill in the information below to see the insight.")
    with gr.Row():
        with gr.Column():
            name = gr.Textbox(placeholder="Organization Name", label='Organization')
            domain = gr.Textbox(placeholder="Domain", label='Domain')
            submit = gr.Button(value="Submit")
        with gr.Column():
            result = gr.Textbox(label="Output", lines=28)
    submit.click(gradio_fun, inputs=[name, domain], outputs=result)

demo.launch()
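
# Usage sketch (assumptions: Google Chrome with a matching chromedriver on PATH,
# and the selenium, bs4, lxml, scikit-learn, transformers, torch, langdetect, and
# gradio packages installed; "app.py" below is a hypothetical filename):
#
#   python app.py
#
# Running the script starts the Gradio app on the default local URL
# (http://127.0.0.1:7860). Enter an organization name and an optional domain,
# then press Submit to get clustered, summarized news with supporting links.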