import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# URLs of the pages this script scrapes.
# NOTE(review): this list is not read anywhere in the visible code; each
# crawl_* function hard-codes its own URL.
urls = [
    "https://www.calss.net.cn/p1/kybgList/20251124/40156.html",  # Chinese Academy of Labour and Social Security
    "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html",  # National Bureau of Statistics
    "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html"  # Hunan Provincial Department of Human Resources and Social Security
]

# Shared accumulator: the crawl_* functions append one dict per job record
# here, and main() turns the list into a DataFrame.
job_data = []

# Pool of User-Agent header values used to imitate a regular browser.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36"
]

def get_random_user_agent():
    """Return one User-Agent string picked at random from ``user_agents``."""
    chosen_agent = random.choice(user_agents)
    return chosen_agent

def _parse_salary(raw_text):
    """Convert a scraped salary cell to a float; non-numeric text becomes 0."""
    try:
        return float(raw_text)
    except ValueError:
        # Keep the record but mark the salary as 0 when the cell is not a
        # plain number (e.g. it carries units or is empty).
        return 0


def _append_calss_table(table):
    """Append one record to ``job_data`` for each data row of *table*.

    The first ``<tr>`` is treated as the header and skipped. Rows with fewer
    than two ``<td>`` cells are ignored. The first cell is used as the job
    title and the second as the salary value.
    """
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        job_data.append({
            '岗位名称': cells[0].text.strip(),
            '薪资(万元/月)': _parse_salary(cells[1].text.strip()),
            '学历要求': '本科及以上',  # fixed default, not read from the page
            '数据来源': '中国劳动和社会保障科学研究院'
        })


def crawl_calss():
    """Scrape job/salary tables from the CALSS report page into ``job_data``.

    Downloads the page, then reads the first two ``<table>`` elements (if
    present) and appends one record per data row to the module-level
    ``job_data`` list. The original comments describe the first table as hot
    digital jobs in key regions and the second as typical jobs in key
    industries.

    Any failure (network error, non-2xx HTTP status, parsing error) is
    reported with ``print`` and swallowed so the remaining crawlers can
    still run. Returns ``None``.
    """
    url = "https://www.calss.net.cn/p1/kybgList/20251124/40156.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Do not parse an error page as if it were the report.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Only the first two tables are of interest; slicing copes with
        # pages that contain zero or one table.
        for table in soup.find_all('table')[:2]:
            _append_calss_table(table)
    except Exception as e:
        print(f"抓取中国劳动和社会保障科学研究院数据失败: {e}")

def crawl_stats_gov():
    """Record National Bureau of Statistics average-wage figures in ``job_data``.

    Downloads the NBS release page and, when the request succeeds and the
    page contains a ``<div class="content">`` element, appends five records
    (one per occupation group) to the module-level ``job_data`` list.

    NOTE(review): the wage figures are hard-coded below rather than parsed
    from the downloaded page; the fetch only decides whether they are
    recorded. Confirm the numbers against the page before relying on them.

    Any failure (network error, non-2xx HTTP status, parsing error) is
    reported with ``print`` and swallowed. Returns ``None``.
    """
    url = "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Do not treat an error page as a successful fetch.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        content = soup.find('div', class_='content')
        if content:
            # Average wage by occupation group for enterprises above the
            # designated size (per the original comment). Presumably yuan
            # per year - TODO confirm against the source page.
            positions = [
                ('中层及以上管理人员', 203014),
                ('专业技术人员', 148046),
                ('办事人员和有关人员', 93189),
                ('社会生产服务和生活服务人员', 77584),
                ('生产制造及有关人员', 78561)
            ]

            for job, salary in positions:
                # 120000 = 12 months x 10000: annual amount -> 10k per month.
                salary_month = round(salary / 120000, 2)
                job_data.append({
                    '岗位名称': job,
                    '薪资(万元/月)': salary_month,
                    '学历要求': '本科及以上',  # fixed default, not read from the page
                    '数据来源': '国家统计局'
                })
    except Exception as e:
        print(f"抓取国家统计局数据失败: {e}")

def crawl_hunan_rst():
    """Record Hunan HRSS shortage-occupation entries in ``job_data``.

    Downloads the Hunan Provincial Department of Human Resources and Social
    Security page and, when the request succeeds and the page contains a
    ``<div class="content">`` element, appends five records to the
    module-level ``job_data`` list.

    NOTE(review): the occupation list is hard-coded rather than parsed from
    the page, and the salary written for each record is a random placeholder
    drawn from [0.5, 1.5], not real data. Replace it with a real source
    before using the output for analysis.

    Any failure (network error, non-2xx HTTP status, parsing error) is
    reported with ``print`` and swallowed. Returns ``None``.
    """
    url = "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Do not treat an error page as a successful fetch.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        content = soup.find('div', class_='content')
        if content:
            # Top five shortage occupations (per the original comment). The
            # second value is presumably the demand ratio - TODO confirm; it
            # is not written to the output.
            shortage_jobs = [
                ('纺织针织印染人员', 2.96),
                ('商品营业员', 2.66),
                ('生产辅助人员', 2.57),
                ('营销员', 2.43),
                ('家政服务员', 2.33)
            ]

            for job, _demand_ratio in shortage_jobs:
                # Placeholder salary: a random value, not a scraped figure.
                salary_month = round(random.uniform(0.5, 1.5), 2)
                job_data.append({
                    '岗位名称': job,
                    '薪资(万元/月)': salary_month,
                    '学历要求': '初中及以上',  # fixed default, not read from the page
                    '数据来源': '湖南省人社厅'
                })
    except Exception as e:
        print(f"抓取湖南省人社厅数据失败: {e}")

def main():
    """Run every crawler, then save and preview the collected records.

    Calls the three ``crawl_*`` functions in order with a random 1-3 second
    pause between consecutive requests, builds a DataFrame from the
    module-level ``job_data`` list, writes it to '原始人才市场数据.csv'
    (UTF-8 with BOM, no index column) and prints the first ten rows.
    """
    print("开始抓取人才市场数据...")

    crawlers = (crawl_calss, crawl_stats_gov, crawl_hunan_rst)
    for position, crawl in enumerate(crawlers):
        if position:
            # Random pause between requests to reduce the chance of being
            # blocked; nothing follows the last crawl, so no pause after it.
            time.sleep(random.uniform(1, 3))
        crawl()

    df = pd.DataFrame(job_data)

    # Persist the raw, unprocessed records.
    df.to_csv('原始人才市场数据.csv', index=False, encoding='utf-8-sig')
    print(f"已抓取 {len(df)} 条数据，保存到 '原始人才市场数据.csv'")

    # Preview the first ten rows.
    print("\n前10条数据:")
    print(df.head(10))

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()