enesmanan committed on
Commit
9cbdd01
·
verified ·
1 Parent(s): b42b622

add chrome webdriver docker file

Browse files
Files changed (3) hide show
  1. .gitignore +171 -0
  2. Dockerfile +16 -0
  3. scrape/trendyol_scraper.py +23 -9
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+
5
+ # RAG
6
+ chroma_*
7
+ chroma
8
+ .DS_Store
9
+
10
+ # experimental
11
+ agent_experimental.ipynb
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ *.py,cover
57
+ .hypothesis/
58
+ .pytest_cache/
59
+ cover/
60
+
61
+ # Translations
62
+ *.mo
63
+ *.pot
64
+
65
+ # Django stuff:
66
+ *.log
67
+ local_settings.py
68
+ db.sqlite3
69
+ db.sqlite3-journal
70
+
71
+ # Flask stuff:
72
+ instance/
73
+ .webassets-cache
74
+
75
+ # Scrapy stuff:
76
+ .scrapy
77
+
78
+ # Sphinx documentation
79
+ docs/_build/
80
+
81
+ # PyBuilder
82
+ .pybuilder/
83
+ target/
84
+
85
+ # Jupyter Notebook
86
+ .ipynb_checkpoints
87
+
88
+ # IPython
89
+ profile_default/
90
+ ipython_config.py
91
+
92
+ # pyenv
93
+ # For a library or package, you might want to ignore these files since the code is
94
+ # intended to run in multiple environments; otherwise, check them in:
95
+ # .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ #pdm.lock
114
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115
+ # in version control.
116
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
117
+ .pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122
+ __pypackages__/
123
+
124
+ # Celery stuff
125
+ celerybeat-schedule
126
+ celerybeat.pid
127
+
128
+ # SageMath parsed files
129
+ *.sage.py
130
+
131
+ # Environments
132
+ .env
133
+ .venv
134
+ env/
135
+ venv/
136
+ ENV/
137
+ env.bak/
138
+ venv.bak/
139
+ .streamlit/
140
+ secrets.toml
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install Chrome and ChromeDriver
6
+ RUN apt-get update && apt-get install -y \
7
+ chromium \
8
+ chromium-driver \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ COPY requirements.txt .
12
+ RUN pip install -r requirements.txt
13
+
14
+ COPY . .
15
+
16
+ CMD ["python", "app.py"]
scrape/trendyol_scraper.py CHANGED
@@ -3,9 +3,12 @@ from selenium.webdriver.chrome.service import Service
3
  from selenium.webdriver.common.by import By
4
  from selenium.webdriver.support.ui import WebDriverWait
5
  from selenium.webdriver.support import expected_conditions as EC
 
6
  import time
7
  import pandas as pd
8
  import os
 
 
9
 
10
  def scrape_reviews(url):
11
  # Create data directory if it doesn't exist
@@ -23,21 +26,31 @@ def scrape_reviews(url):
23
  break
24
  last_height = new_height
25
 
26
- chrome_options = webdriver.ChromeOptions()
 
27
  chrome_options.add_argument('--headless')
28
- chrome_options.add_argument('--disable-gpu')
29
  chrome_options.add_argument('--no-sandbox')
30
  chrome_options.add_argument('--disable-dev-shm-usage')
 
31
  chrome_options.add_argument("--window-size=1920,1080")
32
-
 
33
  try:
34
- service = Service() # Hugging Face Spaces için path belirtmeye gerek yok
 
 
 
 
35
  driver = webdriver.Chrome(service=service, options=chrome_options)
 
36
  driver.get(url)
37
-
38
- WebDriverWait(driver, 10).until(
39
- EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
40
- ).click()
 
 
 
41
 
42
  comprehensive_scroll(driver)
43
 
@@ -86,4 +99,5 @@ def scrape_reviews(url):
86
  return pd.DataFrame()
87
 
88
  finally:
89
- driver.quit()
 
 
3
  from selenium.webdriver.common.by import By
4
  from selenium.webdriver.support.ui import WebDriverWait
5
  from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.webdriver.chrome.options import Options
7
  import time
8
  import pandas as pd
9
  import os
10
+ from webdriver_manager.chrome import ChromeDriverManager
11
+ from selenium.webdriver.chrome.service import Service as ChromeService
12
 
13
  def scrape_reviews(url):
14
  # Create data directory if it doesn't exist
 
26
  break
27
  last_height = new_height
28
 
29
+ # Chrome options for Linux/Space environment
30
+ chrome_options = Options()
31
  chrome_options.add_argument('--headless')
 
32
  chrome_options.add_argument('--no-sandbox')
33
  chrome_options.add_argument('--disable-dev-shm-usage')
34
+ chrome_options.add_argument("--disable-gpu")
35
  chrome_options.add_argument("--window-size=1920,1080")
36
+ chrome_options.add_argument('--disable-blink-features=AutomationControlled')
37
+
38
  try:
39
+ # Install Chrome and ChromeDriver for Linux
40
+ os.system('apt-get update && apt-get install -y chromium-browser chromium-chromedriver')
41
+
42
+ # Manage ChromeDriver automatically
43
+ service = ChromeService()
44
  driver = webdriver.Chrome(service=service, options=chrome_options)
45
+
46
  driver.get(url)
47
+
48
+ try:
49
+ WebDriverWait(driver, 10).until(
50
+ EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
51
+ ).click()
52
+ except:
53
+ print("Çerez popup'ı bulunamadı, devam ediliyor...")
54
 
55
  comprehensive_scroll(driver)
56
 
 
99
  return pd.DataFrame()
100
 
101
  finally:
102
+ if 'driver' in locals():
103
+ driver.quit()