SoulofSukuna committed on
Commit
46ed472
·
verified ·
1 Parent(s): 326883a

Update helper/html_scraper.py

Browse files
Files changed (1) hide show
  1. helper/html_scraper.py +9 -13
helper/html_scraper.py CHANGED
@@ -7,7 +7,6 @@ from constants.headers import HEADER_AIO
7
 
8
  HTTP_PROXY = os.environ.get("HTTP_PROXY", None)
9
 
10
-
11
  class Scraper:
12
  @decorator_asyncio_fix
13
  async def _get_html(self, session, url):
@@ -21,23 +20,20 @@ class Scraper:
21
  return await asyncio.gather(asyncio.create_task(self._get_html(session, url)))
22
 
23
 
24
- class AsyncCloudscraper:
25
  def __init__(self):
26
  self.scraper = cloudscraper.create_scraper()
27
 
28
- @decorator_asyncio_fix
29
- async def _get_html(self, url):
30
- loop = asyncio.get_running_loop()
31
- return await loop.run_in_executor(None, self._sync_scrape, url)
32
-
33
- def _sync_scrape(self, url):
34
  try:
35
- response = self.scraper.get(url, headers=HEADER_AIO)
 
36
  return response.text
37
  except Exception as e:
38
- print(f"Error occurred while fetching {url}: {e}")
39
  return None
40
 
41
- async def get_all_results(self, urls):
42
- tasks = [asyncio.create_task(self._get_html(url)) for url in urls]
43
- return await asyncio.gather(*tasks)
 
 
7
 
8
  HTTP_PROXY = os.environ.get("HTTP_PROXY", None)
9
 
 
10
  class Scraper:
11
  @decorator_asyncio_fix
12
  async def _get_html(self, session, url):
 
20
  return await asyncio.gather(asyncio.create_task(self._get_html(session, url)))
21
 
22
 
23
+ class CloudScraper:
24
  def __init__(self):
25
  self.scraper = cloudscraper.create_scraper()
26
 
27
+ def _get_html(self, url):
 
 
 
 
 
28
  try:
29
+ response = self.scraper.get(url, headers=HEADER_AIO, proxies={'http': HTTP_PROXY, 'https': HTTP_PROXY})
30
+ response.raise_for_status() # Raise an error for bad responses
31
  return response.text
32
  except Exception as e:
33
+ print(f"Error fetching {url}: {e}")
34
  return None
35
 
36
+ async def get_all_results(self, url):
37
+ loop = asyncio.get_event_loop()
38
+ with ThreadPoolExecutor() as pool:
39
+ return await loop.run_in_executor(pool, self._get_html, url)