You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

188 lines
7.1 KiB

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
# Target pages to scrape.
urls = [
    # Chinese Academy of Labour and Social Security
    "https://www.calss.net.cn/p1/kybgList/20251124/40156.html",
    # National Bureau of Statistics
    "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html",
    # Hunan Provincial Department of Human Resources and Social Security
    "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html",
]

# Accumulator shared by all crawlers; each entry is one job record dict.
job_data = []

# Desktop-Chrome User-Agent strings rotated to mimic a real browser.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36",
]
def get_random_user_agent():
    """Return one of the configured User-Agent strings, chosen uniformly."""
    pool = user_agents
    return pool[random.randrange(len(pool))]
def _append_calss_table(table):
    """Parse one CALSS salary <table> and append its rows to job_data.

    Skips the header row. Rows with fewer than two cells are ignored;
    salary cells that do not parse as a float are recorded as 0.
    """
    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        job = cells[0].text.strip()
        salary = cells[1].text.strip()
        try:
            salary_num = float(salary)
        except ValueError:
            # Was a bare `except:`; only float() parsing can fail here,
            # so catch exactly that and fall back to 0.
            salary_num = 0
        job_data.append({
            '岗位名称': job,
            '薪资(万元/月)': salary_num,
            '学历要求': '本科及以上',  # default assumed for this industry
            '数据来源': '中国劳动和社会保障科学研究院'
        })


def crawl_calss():
    """Scrape hot-job salary tables from the CALSS page into job_data.

    Reads up to the first two <table> elements on the page (key-region
    digital jobs, then key-industry typical jobs) — both share the same
    (job, salary) layout, so one helper parses each. Any failure is
    printed and swallowed so the remaining crawlers still run.
    """
    url = "https://www.calss.net.cn/p1/kybgList/20251124/40156.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Table 0: key-region digital jobs; table 1 (if present):
        # key-industry typical jobs. Identical structure, same parser.
        for table in soup.find_all('table')[:2]:
            _append_calss_table(table)
    except Exception as e:
        print(f"抓取中国劳动和社会保障科学研究院数据失败: {e}")
def crawl_stats_gov():
    """Append National Bureau of Statistics average-wage rows to job_data.

    NOTE(review): the wage figures below are hard-coded (annual yuan per
    occupation group) rather than parsed from the page; the page is still
    fetched so rows are only recorded when the source is reachable and
    exposes the expected content container. Failures are printed and
    swallowed so the remaining crawlers still run.
    """
    url = "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', class_='content')
        if content:
            # Average annual wages (yuan/year) by occupation group for
            # enterprises above designated size. Hard-coded; TODO: parse
            # these from the page text instead. (Removed an unused
            # `text = content.get_text()` that was never read.)
            positions = [
                ('中层及以上管理人员', 203014),
                ('专业技术人员', 148046),
                ('办事人员和有关人员', 93189),
                ('社会生产服务和生活服务人员', 77584),
                ('生产制造及有关人员', 78561)
            ]
            for job, salary in positions:
                # yuan/year -> 万元/month: divide by 10,000 (万) and 12 months.
                salary_month = round(salary / 120000, 2)
                job_data.append({
                    '岗位名称': job,
                    '薪资(万元/月)': salary_month,
                    '学历要求': '本科及以上',  # default assumed per occupation group
                    '数据来源': '国家统计局'
                })
    except Exception as e:
        print(f"抓取国家统计局数据失败: {e}")
def crawl_hunan_rst():
    """Append Hunan HR department labor-shortage jobs to job_data.

    NOTE(review): the top-five shortage occupations below are hard-coded
    (name, demand/supply ratio) rather than parsed from the page, and the
    recorded salary is a random placeholder in [0.5, 1.5] 万元/month —
    replace with real market figures. The page is still fetched so rows
    are only recorded when the source is reachable and exposes the
    expected content container.
    """
    url = "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html"
    headers = {
        "User-Agent": get_random_user_agent()
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', class_='content')
        if content:
            # Top-five shortage occupations as (name, demand/supply ratio).
            # (Removed an unused `text = content.get_text()` that was
            # never read.)
            shortage_jobs = [
                ('纺织针织印染人员', 2.96),
                ('商品营业员', 2.66),
                ('生产辅助人员', 2.57),
                ('营销员', 2.43),
                ('家政服务员', 2.33)
            ]
            for job, _demand_ratio in shortage_jobs:  # ratio currently unused
                # Placeholder salary — random, NOT real data (see docstring).
                salary_month = round(random.uniform(0.5, 1.5), 2)
                job_data.append({
                    '岗位名称': job,
                    '薪资(万元/月)': salary_month,
                    '学历要求': '初中及以上',  # default assumed per occupation
                    '数据来源': '湖南省人社厅'
                })
    except Exception as e:
        print(f"抓取湖南省人社厅数据失败: {e}")
def main():
    """Run every crawler, save the combined records to CSV, preview them.

    Crawlers append to the module-level job_data list; a random delay is
    inserted between sites to reduce the chance of anti-bot blocking.
    (The original also slept after the LAST crawler — dead delay with
    nothing following it — which is removed here.)
    """
    print("开始抓取人才市场数据...")
    crawlers = [crawl_calss, crawl_stats_gov, crawl_hunan_rst]
    for index, crawl in enumerate(crawlers):
        crawl()
        if index < len(crawlers) - 1:
            # Random pause between sites only — no point sleeping after
            # the final request.
            time.sleep(random.uniform(1, 3))
    df = pd.DataFrame(job_data)
    # utf-8-sig writes a BOM so Excel opens the Chinese headers correctly.
    df.to_csv('原始人才市场数据.csv', index=False, encoding='utf-8-sig')
    print(f"已抓取 {len(df)} 条数据,保存到 '原始人才市场数据.csv'")
    print("\n前10条数据:")
    print(df.head(10))


if __name__ == "__main__":
    main()