You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

188 lines
7.1 KiB

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
# Target pages to scrape.
urls = [
    # Chinese Academy of Labour and Social Security
    "https://www.calss.net.cn/p1/kybgList/20251124/40156.html",
    # National Bureau of Statistics
    "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html",
    # Hunan Provincial Department of Human Resources and Social Security
    "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html",
]

# Accumulator shared by all crawlers; each entry is one job record dict.
job_data = []

# Desktop-Chrome User-Agent strings rotated to mimic a real browser.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36",
]
def get_random_user_agent():
    """Return one of the configured User-Agent strings, chosen uniformly."""
    pool = user_agents
    return pool[random.randrange(len(pool))]
def _append_calss_table(table):
    """Parse one CALSS salary <table> and append its rows to job_data.

    Skips the header row. Rows with fewer than two cells are ignored;
    salary cells that do not parse as a float are recorded as 0.
    """
    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        job = cells[0].text.strip()
        salary = cells[1].text.strip()
        try:
            salary_num = float(salary)
        except ValueError:
            # Was a bare `except:`; only float() parsing can fail here,
            # so catch exactly that and fall back to 0.
            salary_num = 0
        job_data.append({
            '岗位名称': job,
            '薪资(万元/月)': salary_num,
            '学历要求': '本科及以上',  # default assumed for this industry
            '数据来源': '中国劳动和社会保障科学研究院'
        })


def crawl_calss():
    """Scrape hot-job salary tables from the CALSS page into job_data.

    Reads up to the first two <table> elements on the page (key-region
    digital jobs, then key-industry typical jobs) — both share the same
    (job, salary) layout, so one helper parses each. Any failure is
    printed and swallowed so the remaining crawlers still run.
    """
    url = "https://www.calss.net.cn/p1/kybgList/20251124/40156.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Table 0: key-region digital jobs; table 1 (if present):
        # key-industry typical jobs. Identical structure, same parser.
        for table in soup.find_all('table')[:2]:
            _append_calss_table(table)
    except Exception as e:
        print(f"抓取中国劳动和社会保障科学研究院数据失败: {e}")
def crawl_stats_gov():
    """Append National Bureau of Statistics average-wage rows to job_data.

    NOTE(review): the wage figures below are hard-coded (annual yuan per
    occupation group) rather than parsed from the page; the page is still
    fetched so rows are only recorded when the source is reachable and
    exposes the expected content container. Failures are printed and
    swallowed so the remaining crawlers still run.
    """
    url = "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', class_='content')
        if content:
            # Average annual wages (yuan/year) by occupation group for
            # enterprises above designated size. Hard-coded; TODO: parse
            # these from the page text instead. (Removed an unused
            # `text = content.get_text()` that was never read.)
            positions = [
                ('中层及以上管理人员', 203014),
                ('专业技术人员', 148046),
                ('办事人员和有关人员', 93189),
                ('社会生产服务和生活服务人员', 77584),
                ('生产制造及有关人员', 78561)
            ]
            for job, salary in positions:
                # yuan/year -> 万元/month: divide by 10,000 (万) and 12 months.
                salary_month = round(salary / 120000, 2)
                job_data.append({
                    '岗位名称': job,
                    '薪资(万元/月)': salary_month,
                    '学历要求': '本科及以上',  # default assumed per occupation group
                    '数据来源': '国家统计局'
                })
    except Exception as e:
        print(f"抓取国家统计局数据失败: {e}")
def crawl_hunan_rst():
    """Append Hunan HR department labor-shortage jobs to job_data.

    NOTE(review): the top-five shortage occupations below are hard-coded
    (name, demand/supply ratio) rather than parsed from the page, and the
    recorded salary is a random placeholder in [0.5, 1.5] 万元/month —
    replace with real market figures. The page is still fetched so rows
    are only recorded when the source is reachable and exposes the
    expected content container.
    """
    url = "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', class_='content')
        if content:
            # Top-five shortage occupations as (name, demand/supply ratio).
            # (Removed an unused `text = content.get_text()` that was
            # never read.)
            shortage_jobs = [
                ('纺织针织印染人员', 2.96),
                ('商品营业员', 2.66),
                ('生产辅助人员', 2.57),
                ('营销员', 2.43),
                ('家政服务员', 2.33)
            ]
            for job, _demand_ratio in shortage_jobs:  # ratio currently unused
                # Placeholder salary — random, NOT real data (see docstring).
                salary_month = round(random.uniform(0.5, 1.5), 2)
                job_data.append({
                    '岗位名称': job,
                    '薪资(万元/月)': salary_month,
                    '学历要求': '初中及以上',  # default assumed per occupation
                    '数据来源': '湖南省人社厅'
                })
    except Exception as e:
        print(f"抓取湖南省人社厅数据失败: {e}")
def main():
    """Run every crawler, save the combined records to CSV, preview them.

    Crawlers append to the module-level job_data list; a random delay is
    inserted between sites to reduce the chance of anti-bot blocking.
    (The original also slept after the LAST crawler — dead delay with
    nothing following it — which is removed here.)
    """
    print("开始抓取人才市场数据...")
    crawlers = [crawl_calss, crawl_stats_gov, crawl_hunan_rst]
    for index, crawl in enumerate(crawlers):
        crawl()
        if index < len(crawlers) - 1:
            # Random pause between sites only — no point sleeping after
            # the final request.
            time.sleep(random.uniform(1, 3))
    df = pd.DataFrame(job_data)
    # utf-8-sig writes a BOM so Excel opens the Chinese headers correctly.
    df.to_csv('原始人才市场数据.csv', index=False, encoding='utf-8-sig')
    print(f"已抓取 {len(df)} 条数据,保存到 '原始人才市场数据.csv'")
    print("\n前10条数据:")
    print(df.head(10))


if __name__ == "__main__":
    main()