File size: 2,632 Bytes
a2ee974
 
 
 
 
 
5782e66
a2ee974
 
 
 
 
 
5782e66
566bba1
 
 
 
 
 
5782e66
 
a2ee974
 
 
 
 
 
77b7045
566bba1
 
 
 
f5f34c2
 
 
 
566bba1
a2ee974
 
 
 
 
 
 
 
 
 
 
14ee6ff
 
 
 
 
 
 
a2ee974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Created by Leandro Carneiro at 19/01/2024
# Description: Helpers to search news via the Google Custom Search API,
#              download and extract page text, and maintain a local base
#              of saved articles on disk.
# ------------------------------------------------
import os.path
import time

from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup

import constants


def google_search_api(search_term, api_key, cse_id, **kwargs):
    """Run one query against the Google Custom Search JSON API.

    Parameters:
        search_term: full query string (may include "site:" operators).
        api_key: Google API key (passed as developerKey to the client).
        cse_id: Custom Search Engine id (the "cx" parameter).
        **kwargs: extra cse().list() parameters, e.g. num=10.

    Returns:
        The list of result items on success, or -1 on any failure
        (callers test for -1 explicitly — keep that sentinel).
    """
    try:
        service = build("customsearch", "v1", developerKey=api_key)
        response = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
        # 'items' is absent when the search returns no results; map that
        # to the same -1 sentinel instead of relying on a KeyError being
        # swallowed by the except below (as the original did).
        return response.get('items', -1)
    except Exception as e:
        # BUG FIX: the original swallowed the exception silently; keep the
        # -1 contract but report the cause so failures are diagnosable.
        print('    Erro na busca do Google: ' + str(e))
        return -1

    
def search_google(subject, sites):
    """Search Google for `subject` restricted to each domain in `sites`.

    Parameters:
        subject: free-text search subject.
        sites: iterable of domain names used as "site:" restrictions.

    Returns:
        A list of result URLs (PDF links are excluded), or the exception
        text as a string if something unexpected fails.
    """
    try:
        results = []
        for site in sites:
            print('    Buscando notícias no domínio: ' + site)
            query = f"{subject} site:{site}"
            found = google_search_api(query,
                                      os.environ['GOOGLE_KEY'],
                                      os.environ['GOOGLE_SEARCH'],
                                      num=constants.num_sites)
            if found == -1:
                # BUG FIX: the original appended the bare domain (no
                # scheme) to the URL list on failure; requests.get()
                # cannot fetch that later and an error string would be
                # saved as news text. Report and skip instead.
                print('    Falha na busca para o domínio: ' + site)
                continue
            for item in found:
                link = item['link']
                if 'pdf' in link.lower():
                    print('    Arquivo PDF encontrado: ' + link)
                else:
                    results.append(link)
        print('    Total de sites encontrados: ' + str(len(results)))
        return results
    except Exception as e:
        print(str(e))
        return str(e)

def retrieve_text_from_site(sites):
    """Download each URL in `sites` and extract its visible text.

    Parameters:
        sites: iterable of URLs to fetch.

    Returns:
        A list aligned with `sites`; a failed download yields the string
        'Erro na recuperação do texto: <cause>' in its slot, so the
        caller's index pairing of URL -> text stays in sync.
    """
    result = []
    for site in sites:
        print('    Baixando texto do site: ' + site)
        try:
            # BUG FIX: the original call had no timeout, so one stalled
            # server could hang the whole run indefinitely.
            response = requests.get(site, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            result.append(soup.get_text())
        except Exception as e:
            # Best-effort by design: record the failure text in place of
            # the page text rather than aborting the batch.
            result.append('Erro na recuperação do texto: ' + str(e))
    return result

def delete_base(local_base):
    """Remove every file directly inside the directory `local_base`.

    Returns:
        0 on success, or the exception text (str) on failure — e.g. when
        the directory does not exist or an entry cannot be removed.
    """
    try:
        for entry in os.listdir(local_base):
            os.remove(os.path.join(local_base, entry))
        return 0
    except Exception as exc:
        return str(exc)

def save_on_base(sites, texts, local_base):
    """Persist downloaded texts and a filename->URL index under `local_base`.

    Writes one 'news{i}.txt' file per entry (texts[i] paired with
    sites[i]) and appends a 'news{i}.txt;<url>' line per saved file to
    'filename_url.csv' (append mode: the index accumulates across calls).

    Returns:
        0 on success, or the exception text (str) on failure — including
        when `texts` is shorter than `sites` (IndexError is caught).
    """
    try:
        index_path = os.path.join(local_base, 'filename_url.csv')
        # Open the index once for the whole batch; the original
        # re-opened/closed the CSV on every loop iteration.
        with open(index_path, 'a', encoding='utf-8') as index_file:
            for i, url in enumerate(sites):
                filename = f'news{i}.txt'
                with open(os.path.join(local_base, filename), 'w', encoding='utf-8') as out:
                    out.write(texts[i])  # IndexError on short `texts` -> except below
                index_file.write(filename + ';' + url + '\n')
        return 0
    except Exception as e:
        return str(e)