# Extraction residue from the repository web viewer (topic-selection hints:
# "no more than 25 topics; topics start with a letter or number, may include
# dashes, up to 35 characters") and its file banner ("188 lines, 7.1 KiB").
# None of this is part of the program.
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URLs of the pages this script scrapes.
# NOTE(review): none of the crawl_* functions visible in this file read this
# list; each one repeats its own URL as a local variable.
urls = [
    # Chinese Academy of Labour and Social Security
    "https://www.calss.net.cn/p1/kybgList/20251124/40156.html",
    # National Bureau of Statistics
    "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html",
    # Hunan Provincial Department of Human Resources and Social Security
    "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html",
]

# Module-level accumulator for scraped records. Each crawl_* function appends
# one dict per job, and main() turns the whole list into a DataFrame.
job_data = []

# Browser User-Agent strings; one is picked at random for each request so the
# scraper presents itself as a regular desktop browser.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36",
]

def get_random_user_agent():
    """Return one entry of the module-level ``user_agents`` list, chosen at random."""
    return random.choice(user_agents)

def _calss_parse_salary(raw):
    """Convert a salary cell's text to ``float``; return 0 when it is not numeric."""
    try:
        return float(raw)
    except ValueError:
        # Non-numeric cell (empty, unit text, ranges...): keep the row but
        # record the salary as 0, as the script has always done.
        return 0


def _calss_records_from_table(table, source):
    """Build one job record per data row of *table* that has at least two cells."""
    records = []
    for row in table.find_all('tr')[1:]:  # first row is the table header
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        records.append({
            '岗位名称': cells[0].text.strip(),
            # The cell value is stored as-is under the "10k yuan / month"
            # column; assumes the page already reports that unit - TODO confirm.
            '薪资(万元/月)': _calss_parse_salary(cells[1].text.strip()),
            '学历要求': '本科及以上',  # fixed default, not read from the page
            '数据来源': source,
        })
    return records


def crawl_calss():
    """Scrape job/salary tables from the Chinese Academy of Labour and Social Security page.

    Downloads the report page, reads the first two ``<table>`` elements (per
    the original comments: hot digital jobs in key regions, then typical jobs
    in key industries) and appends one record per data row to the module-level
    ``job_data`` list.

    Returns:
        None. Results are appended to ``job_data``.

    Any exception (network failure, HTTP error status, parsing error) is
    caught and reported with ``print``; the function never raises.
    """
    url = "https://www.calss.net.cn/p1/kybgList/20251124/40156.html"
    headers = {"User-Agent": get_random_user_agent()}
    source = '中国劳动和社会保障科学研究院'

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Only the first two tables are used; a page with fewer tables simply
        # contributes fewer (or no) records.
        for table in soup.find_all('table')[:2]:
            job_data.extend(_calss_records_from_table(table, source))
    except Exception as e:
        # Best-effort scraper: report the failure and let the caller continue
        # with the other sites.
        print(f"抓取中国劳动和社会保障科学研究院数据失败: {e}")

def crawl_stats_gov():
    """Record average wages by position category, attributed to the National Bureau of Statistics.

    The page is downloaded only to confirm that it is reachable and contains a
    ``<div class="content">`` element. The wage figures themselves are
    hard-coded below, NOT parsed from the page, so they must be updated by
    hand when the published numbers change.

    Returns:
        None. Five records are appended to the module-level ``job_data`` list
        when the content element is found; nothing is appended otherwise.

    Any exception (network failure, HTTP error status, parsing error) is
    caught and reported with ``print``; the function never raises.
    """
    url = "https://www.stats.gov.cn/sj/zxfb/202505/t20250516_1959826.html"
    headers = {"User-Agent": get_random_user_agent()}

    # (position category, annual average wage). Per the original comment these
    # are average annual wages by position for enterprises above designated
    # size; presumably in yuan per year - verify against the published page.
    positions = [
        ('中层及以上管理人员', 203014),
        ('专业技术人员', 148046),
        ('办事人员和有关人员', 93189),
        ('社会生产服务和生活服务人员', 77584),
        ('生产制造及有关人员', 78561),
    ]

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of treating an error page as success.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        content = soup.find('div', class_='content')
        if content is None:
            return

        for job, salary in positions:
            job_data.append({
                '岗位名称': job,
                # Annual amount -> 10k units per month: / 12 months / 10000.
                '薪资(万元/月)': round(salary / 120000, 2),
                '学历要求': '本科及以上',  # fixed default, not read from the page
                '数据来源': '国家统计局',
            })
    except Exception as e:
        # Best-effort scraper: report the failure and let the caller continue
        # with the other sites.
        print(f"抓取国家统计局数据失败: {e}")

def crawl_hunan_rst():
    """Record the top-five shortage occupations, attributed to the Hunan HR and Social Security Department.

    The page is downloaded only to confirm that it is reachable and contains a
    ``<div class="content">`` element. The occupation list is hard-coded
    below, NOT parsed from the page.

    WARNING: the salary stored for each occupation is a random placeholder
    drawn uniformly from [0.5, 1.5]. It is not real data and differs on every
    run.

    Returns:
        None. Five records are appended to the module-level ``job_data`` list
        when the content element is found; nothing is appended otherwise.

    Any exception (network failure, HTTP error status, parsing error) is
    caught and reported with ``print``; the function never raises.
    """
    url = "https://rst.hunan.gov.cn/rst/xxgk/gzdt/zwdt/202504/t20250428_33656960.html"
    headers = {"User-Agent": get_random_user_agent()}

    # (occupation, ratio). Per the original comment these are the five
    # top-ranked shortage occupations; the second value is presumably a
    # demand/supply ratio - verify against the page. It is currently unused.
    shortage_jobs = [
        ('纺织针织印染人员', 2.96),
        ('商品营业员', 2.66),
        ('生产辅助人员', 2.57),
        ('营销员', 2.43),
        ('家政服务员', 2.33),
    ]

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of treating an error page as success.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        content = soup.find('div', class_='content')
        if content is None:
            return

        for job, _demand_ratio in shortage_jobs:
            # NOTE(review): fabricated estimate, not a scraped or published
            # value. Replace with real market data before relying on it.
            salary_month = round(random.uniform(0.5, 1.5), 2)
            job_data.append({
                '岗位名称': job,
                '薪资(万元/月)': salary_month,
                '学历要求': '初中及以上',  # fixed default, not read from the page
                '数据来源': '湖南省人社厅',
            })
    except Exception as e:
        # Best-effort scraper: report the failure and let the caller continue
        # with the other sites.
        print(f"抓取湖南省人社厅数据失败: {e}")

# Script entry point.
def main():
    """Run every crawler, then save and preview the collected records.

    Calls the three crawl_* functions in order with a random 1-3 second pause
    between consecutive sites, builds a DataFrame from the module-level
    ``job_data`` list, writes it to '原始人才市场数据.csv' (UTF-8 with BOM, no
    index column) and prints the first ten rows.
    """
    print("开始抓取人才市场数据...")

    crawlers = (crawl_calss, crawl_stats_gov, crawl_hunan_rst)
    for position, crawl in enumerate(crawlers):
        if position:
            # Random delay between sites to look less like a bot. No delay is
            # needed after the last crawler because no request follows it.
            time.sleep(random.uniform(1, 3))
        crawl()

    df = pd.DataFrame(job_data)

    # Save the raw data.
    df.to_csv('原始人才市场数据.csv', index=False, encoding='utf-8-sig')
    print(f"已抓取 {len(df)} 条数据,保存到 '原始人才市场数据.csv'")

    # Preview the first ten rows.
    print("\n前10条数据:")
    print(df.head(10))

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()