import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr

def scrape_104_jobs(url, num_pages=1):
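    """Scrape job listings from 104.com.tw search-result pages.

    The base `url` must contain a literal 'page=2' placeholder, which is
    swapped for each page number in turn. Returns a pandas DataFrame with
    one row per job (title, company, location, link).
    """
    # Send a desktop-browser User-Agent so the request is less likely to be blocked.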
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    all_jobs = []

    for page in range(1, num_pages + 1):
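        # Substitute the current page number into the 'page=2' placeholder.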
        page_url = url.replace('page=2', f'page={page}')
        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Each result card is an <article class="js-job-item">; pull the
            # job link, company block, and location block out of it.
            job_items = soup.find_all('article', class_='js-job-item')
            for item in job_items:
                job_link = item.find('a', class_='js-job-link')
                company_name = item.find('ul', class_='b-list-inline b-clearfix')
                job_loc = item.find('ul', class_='b-list-inline b-clearfix job-list-intro b-content')

                if job_link and company_name and job_loc:
                    title = job_link.text.strip()
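                    # Hrefs may be protocol-relative ('//www...'); prepend the scheme.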
                    link = 'https:' + job_link['href'] if job_link['href'].startswith('//') else job_link['href']
                    company = company_name.find('a').text.strip()
                    location = job_loc.find('li').text.strip()
                    all_jobs.append({
                        'Job Title': title,
                        'Company': company,
                        'Location': location,
                        'Link': link
                    })

        except requests.RequestException as e:
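            # Log the failure and move on to the next page instead of aborting.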
            print(f"Error fetching page {page}: {e}")
            continue

    return pd.DataFrame(all_jobs)

def get_jobs_from_104(pages):
    """Run the 'AI' keyword search on 104 and return up to `pages` pages of results."""
    # Gradio's Number component passes a float (or None); coerce it and clamp
    # to the 1-5 range advertised in the UI before handing it to range().
    pages = max(1, min(5, int(pages or 1)))
    url = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=AI&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&order=14&asc=0&page=2&mode=s&jobsource=index_s&langFlag=0&langStatus=0&recommendJob=1&hotJob=1'
    df = scrape_104_jobs(url, num_pages=pages)
    return df

# Build the web interface with Gradio
def display_jobs(num_pages):
    df = get_jobs_from_104(num_pages)
    return df

# Create the Gradio interface
interface = gr.Interface(
    fn=display_jobs,             # function to call on submit
    inputs=gr.Number(label="Enter number of pages to scrape (1-5):", value=1, precision=0),  # page count from the user
    outputs="dataframe",         # render the result as a DataFrame
    title="104 Job Scraper",
    description="Scrapes job listings from the 104 Job Bank. Enter the number of pages to fetch (1-5)."
)

# Launch the interface (serves on a local URL, typically http://127.0.0.1:7860)
interface.launch()