from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from langdetect import detect
import numpy as np
import gradio as gr
import re
import torch

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
vectorizer = TfidfVectorizer(stop_words='english')


def text_fetch(url, driver):
    """Load a page with Selenium and return its paragraph text, or None on failure."""
    try:
        print(url)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Strip navigation chrome and cookie banners before extracting text.
        cookie_selectors = [
            'div.cookie-banner', 'div.cookie-popup', 'div.cookie-notice',
            'footer', 'aside', 'div#cookie-consent', 'div#cookie-banner',
            'nav', 'header',
        ]
        for selector in cookie_selectors:
            for element in soup.select(selector):
                element.decompose()

        paragraphs = soup.find_all('p')
        cleaned_text = ''
        # Terms that flag a paragraph as consent/boilerplate text rather than article body.
        dispose = {'cookies', 'cookie', 'news', 'privacy', '\n', 'verifying you are human'}
        for p in paragraphs:
            try:
                text = p.get_text()
                lowered = text.lower()
                # Substring match so multi-word terms like "verifying you are human" are caught too.
                if any(term in lowered for term in dispose):
                    continue
                cleaned_text += ' ' + text
            except Exception:
                continue
        print("ok")
        return cleaned_text
    except TimeoutException:
        return None
    except WebDriverException:
        return None
    except Exception:
        return None


def google_search(query, driver):
    """Search Google News for the query and return result links from the first two pages."""
    try:
        driver.get("https://www.google.com/")
        try:
            print("google")
            search = driver.find_element(By.NAME, "q")
            search.send_keys(query)
            search.send_keys(Keys.RETURN)
        except NoSuchElementException as e:
            print(1, e)
            return []
        except TimeoutException:
            print(2)
            return []

        # Switch to the "News" tab.
        try:
            news_link = driver.find_element(By.LINK_TEXT, "News")
            news_link.click()
        except NoSuchElementException as e:
            print(3, e)
            return []

        try:
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            anchors = soup.find_all('a', class_='WlydOe')

            # Click on "Next" for more results.
            next_link = driver.find_element(By.LINK_TEXT, "Next")
            next_link.click()

            # Scrape the next page for more news links.
            html = driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            anchors.extend(soup.find_all('a', class_='WlydOe'))
        except (NoSuchElementException, TimeoutException) as e:
            print(4, e)
            return []

        links = [anchor.get('href') for anchor in anchors if anchor.get('href')]
        print(len(links))
        return links
    except Exception as e:
        print(5, e)
        return []


def clean_text(text):
    """Keep English text only, stripped of characters outside letters and basic punctuation."""
    try:
        if detect(text) == 'en':
            return re.sub(r'[^a-zA-Z\s.,?!]', '', text)
        return ''
    except Exception:
        return ''


def text_generation(query):
    """Search, fetch each result page, and return a mapping of cleaned text -> source URL."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    # driver.set_page_load_timeout(10)
    print("driver started")

    links = google_search(query, driver)
    pair = {}
    for i, url in enumerate(links, start=1):
        print(i, end=' ')
        text = text_fetch(url, driver)
        if text:
            url = url.rstrip('/')
            text = clean_text(text)
            # clean_text returns '' for non-English pages; skip them so empty
            # keys do not overwrite each other in the dict.
            if text:
                pair[text] = url
    driver.quit()
    print("driver quit")
    print(len(pair), "pairs")
    return pair


def clustering(pairs, e=0.8):
    """Group similar articles with DBSCAN over cosine distances of TF-IDF vectors.

    If fewer than 3 clusters form, retry with a tighter eps and keep the better
    of the two results.
    """
    texts = list(pairs)
    try:
        X = vectorizer.fit_transform(texts)
        cosine_sim_matrix = cosine_similarity(X)
        cosine_dist_matrix = np.clip(1 - cosine_sim_matrix, 0.0, 1.0)
        db = DBSCAN(metric="precomputed", eps=e, min_samples=2)
        labels = db.fit_predict(cosine_dist_matrix)
    except Exception:
        return {}

    cluster = {}
    for i, label in enumerate(labels):
        if label < 0:  # DBSCAN marks noise points as -1
            continue
        if label not in cluster:
            cluster[label] = [[], []]
        cluster[label][0].append(texts[i])
        cluster[label][1].append(pairs[texts[i]])

    best_l = len(cluster)
    best_cluster = cluster
    # Tighten eps and retry if too few clusters formed; stop before eps reaches 0,
    # since DBSCAN requires a positive eps.
    if len(cluster) < 3 and e - 0.1 > 0:
        cluster = clustering(pairs, e - 0.1)

    for label in cluster:
        cluster[label][0].sort(key=len)  # shortest to longest text

    if best_l > len(cluster):
        return best_cluster
    return cluster


def summarization(text):
    """Summarize the first ~700 words of a text; skip texts of 250 words or fewer."""
    words = text.split()
    if len(words) <= 250:
        return None
    text = ' '.join(words[:700])
    summary = summarizer(text, max_length=250, min_length=100, do_sample=False)
    return summary[0]['summary_text']


def output(cluster):
    """Summarize the longest article in each cluster and attach its source URLs."""
    out = {}
    label = 0
    # Always advance the label, even when summarization returns None, so a
    # short cluster cannot loop forever.
    while label in cluster:
        summary = summarization(cluster[label][0][-1])  # longest text in the cluster
        if summary:
            out[summary] = cluster[label][1]
        label += 1
    return out


def query(company, domain=''):
    search_query = company + ' ' + domain
    pairs = text_generation(search_query)
    print("clustering start")
    cluster = clustering(pairs)
    print(len(cluster), "clusters")
    print("summarization")
    result = output(cluster)
    print("Done")
    return result


def gradio_fun(name, domain=''):
    result = query(name, domain)
    out_str = f'"{name.capitalize()}" Report on "{domain.capitalize()}" domain.\n\n'
    for i, text in enumerate(result, start=1):
        out_str += f"{i}. {text}\n\nSupporting URLs\n"
        for link in result[text]:
            out_str += ">>" + link + "\n"
        out_str += '\n\n'
    return out_str


with gr.Blocks(fill_height=True, title="Company Insight") as demo:
    gr.Markdown("# Company Insight\nFill in the information below to see the insight.")
    with gr.Row():
        with gr.Column():
            name = gr.Textbox(placeholder="Organization Name", label='Organization')
            domain = gr.Textbox(placeholder="Domain", label='Domain')
            submit = gr.Button(value="Submit")
        with gr.Column():
            result = gr.Textbox(label="Output", lines=28)
    submit.click(gradio_fun, inputs=[name, domain], outputs=result)

demo.launch()
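
# Usage sketch (assumptions: Google Chrome with a matching chromedriver on PATH,
# and the selenium, bs4, lxml, scikit-learn, transformers, torch, langdetect, and
# gradio packages installed; "app.py" below is a hypothetical filename):
#
#   python app.py
#
# Running the script starts the Gradio app on the default local URL
# (http://127.0.0.1:7860). Enter an organization name and an optional domain,
# then press Submit to get clustered, summarized news with supporting links.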