SoulofSukuna committed: Update helper/html_scraper.py

helper/html_scraper.py  (+9 -13)
@@ -7,7 +7,6 @@ from constants.headers import HEADER_AIO
 
 HTTP_PROXY = os.environ.get("HTTP_PROXY", None)
 
-
 class Scraper:
     @decorator_asyncio_fix
     async def _get_html(self, session, url):
@@ -21,23 +20,20 @@ class Scraper:
         return await asyncio.gather(asyncio.create_task(self._get_html(session, url)))
 
 
-class
+class CloudScraper:
     def __init__(self):
         self.scraper = cloudscraper.create_scraper()
 
-
-    async def _get_html(self, url):
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(None, self._sync_scrape, url)
-
-    def _sync_scrape(self, url):
+    def _get_html(self, url):
         try:
-            response = self.scraper.get(url, headers=HEADER_AIO)
+            response = self.scraper.get(url, headers=HEADER_AIO, proxies={'http': HTTP_PROXY, 'https': HTTP_PROXY})
+            response.raise_for_status()  # Raise an error for bad responses
             return response.text
         except Exception as e:
-            print(f"Error
+            print(f"Error fetching {url}: {e}")
             return None
 
-    async def get_all_results(self,
-
-
+    async def get_all_results(self, url):
+        loop = asyncio.get_event_loop()
+        with ThreadPoolExecutor() as pool:
+            return await loop.run_in_executor(pool, self._get_html, url)
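For reference, a minimal usage sketch of the reworked class, assuming the module also has `from concurrent.futures import ThreadPoolExecutor` at the top (the new get_all_results depends on it) and that CloudScraper is importable from helper/html_scraper; the URL and the main() wrapper below are illustrative, not part of the commit.

import asyncio

from helper.html_scraper import CloudScraper  # the file changed in this commit


async def main():
    scraper = CloudScraper()
    # get_all_results offloads the blocking cloudscraper.get call to a worker
    # thread via run_in_executor, so async callers can await it without
    # blocking the event loop.
    html = await scraper.get_all_results("https://example.com")  # example URL
    if html is None:
        print("Request failed; _get_html already printed the error.")
    else:
        print(f"Fetched {len(html)} characters of HTML.")


if __name__ == "__main__":
    asyncio.run(main())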