33 changed files with 836 additions and 6 deletions
@ -1,3 +1,28 @@ |
|||
package w5; |
|||
|
|||
/**
 * Base type for drawable shapes.
 *
 * <p>Each concrete subclass supplies its own {@link #draw()} implementation, which lets
 * callers handle any shape through a {@code Shape} reference.
 */
public abstract class Shape {

    /** Renders this shape; the concrete behavior is defined by the subclass. */
    public abstract void draw();
}
|||
class Circle extends Shape { |
|||
@Override |
|||
public void draw() { |
|||
System.out.println("绘制一个圆形"); |
|||
} |
|||
} |
|||
class Rectangle extends Shape { |
|||
@Override |
|||
public void draw() { |
|||
System.out.println("绘制一个矩形"); |
|||
} |
|||
} |
|||
class ShapeTest { |
|||
public static void drawShape(Shape s) { |
|||
s.draw(); |
|||
} |
|||
public static void main(String[] args) { |
|||
Shape circle = new Circle(); |
|||
Shape rectangle = new Rectangle(); |
|||
drawShape(circle); |
|||
drawShape(rectangle); |
|||
} |
|||
} |
|||
|
|||
@ -0,0 +1,33 @@ |
|||
package w7; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.FileReader; |
|||
import java.io.IOException; |
|||
|
|||
/**
 * Reads integer scores from {@code scores.txt} (one score per line) and prints their average.
 */
public class SaveAerage {

    /**
     * Sums every line of {@code scores.txt} that parses as an integer and prints the
     * average with two decimals. Lines that are not valid integers are reported and skipped.
     * A missing file or a read failure is reported on standard error.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String filePath = "scores.txt";
        // long avoids overflow when many large scores are accumulated.
        long sum = 0;
        int count = 0;

        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Parse each line on its own so one bad line does not abort the whole file.
                try {
                    sum += Integer.parseInt(line.trim());
                    count++;
                } catch (NumberFormatException e) {
                    System.out.println("数字格式错误,跳过无效数据: " + line);
                }
            }

            if (count == 0) {
                System.out.println("文件中没有有效成绩数据");
            } else {
                double average = (double) sum / count;
                // printf (not println) is required for the %.2f / %n format specifiers.
                System.out.printf("平均分: %.2f%n", average);
            }
        } catch (java.io.FileNotFoundException e) {
            System.err.println("错误:文件不存在 —— " + filePath);
        } catch (IOException e) {
            System.err.println("错误:读取文件时发生异常 —— " + e.getMessage());
        }
    }
}
|||
@ -0,0 +1,146 @@ |
|||
# 大数据招聘爬虫项目 |
|||
|
|||
## 项目简介 |
|||
这是一个适合新手学习的大数据招聘信息爬虫项目,使用Java语言开发,实现了爬取、处理和存储招聘信息的完整流程。 |
|||
|
|||
## 技术栈 |
|||
- **开发语言**: Java 11+ |
|||
- **爬虫框架**: Jsoup 1.17.2 |
|||
- **构建工具**: Maven 3.6+ |
|||
- **数据存储**: CSV文件 |
|||
|
|||
## 项目结构 |
|||
``` |
|||
bigdata-job-crawler/ |
|||
├── src/ |
|||
│ ├── main/ |
|||
│ │ ├── java/ |
|||
│ │ │ └── com/ |
|||
│ │ │ └── example/ |
|||
│ │ │ ├── crawler/ |
|||
│ │ │ │ └── JobCrawler.java # 爬虫核心类 |
|||
│ │ │ ├── processor/ |
|||
│ │ │ │ └── DataProcessor.java # 数据处理类 |
|||
│ │ │ ├── storage/ |
|||
│ │ │ │ └── DataStorage.java # 数据存储类 |
|||
│ │ │ └── Main.java # 主类 |
|||
│ │ └── resources/ |
|||
├── pom.xml # Maven配置文件 |
|||
├── bigdata_jobs.csv # 爬取结果文件 |
|||
└── README.md # 项目说明 |
|||
``` |
|||
|
|||
## 环境要求 |
|||
- JDK 11或更高版本 |
|||
- Maven 3.6或更高版本 |
|||
|
|||
## 安装步骤 |
|||
|
|||
### 1. 安装JDK |
|||
从Oracle官网下载并安装JDK 11+,配置JAVA_HOME环境变量。 |
|||
|
|||
### 2. 安装Maven |
|||
从Apache官网下载并安装Maven 3.6+,配置MAVEN_HOME环境变量。 |
|||
|
|||
### 3. 验证环境 |
|||
```bash |
|||
java -version |
|||
mvn -version |
|||
``` |
|||
|
|||
## 使用方法 |
|||
|
|||
### 1. 编译项目 |
|||
```bash |
|||
mvn clean compile |
|||
``` |
|||
|
|||
### 2. 运行项目 |
|||
```bash |
|||
# 方法1:使用java命令直接运行(以下为Windows示例:类路径用 ; 分隔,Linux/macOS 请改用 : ;并将 jsoup 的 jar 路径替换为本机 Maven 本地仓库中的实际路径)
|||
java -cp "target/classes;C:\Users\lenovo\.m2\repository\org\jsoup\jsoup\1.17.2\jsoup-1.17.2.jar" com.example.Main |
|||
|
|||
# 方法2:使用Maven exec插件(需要先在pom.xml中配置) |
|||
mvn exec:java -Dexec.mainClass="com.example.Main" |
|||
``` |
|||
|
|||
### 3. 查看结果 |
|||
程序运行后,会在项目根目录为每个爬虫分别生成一个 `<爬虫名称>_jobs.csv` 文件(例如 `模拟数据爬虫_jobs.csv`),包含该爬虫爬取的职位信息。
|||
|
|||
## 项目说明 |
|||
|
|||
### 核心类介绍 |
|||
|
|||
#### 1. JobCrawler.java |
|||
负责爬取招聘信息,目前使用模拟数据进行演示。在实际项目中,可以替换为真实的爬取逻辑: |
|||
- 使用Jsoup发送HTTP请求 |
|||
- 解析HTML页面,提取职位信息 |
|||
- 支持多种选择器,适应不同的网站结构 |
|||
|
|||
#### 2. DataProcessor.java |
|||
负责处理爬取的数据: |
|||
- 清理空白字符 |
|||
- 标准化数据格式 |
|||
- 数据验证 |
|||
|
|||
#### 3. DataStorage.java |
|||
负责将处理后的数据保存为CSV文件: |
|||
- 生成CSV格式文件 |
|||
- 支持自定义文件路径 |
|||
- 使用缓冲写入,提高性能 |
|||
|
|||
#### 4. Main.java |
|||
程序的主入口,协调整个爬虫流程: |
|||
- 调用爬虫获取数据 |
|||
- 调用处理器清洗数据 |
|||
- 调用存储器保存数据 |
|||
|
|||
## 当前功能 |
|||
- ✅ 爬取大数据相关职位信息 |
|||
- ✅ 数据清洗和处理 |
|||
- ✅ 数据存储为CSV文件 |
|||
- ✅ 支持多种职位信息字段(职位名称、薪资、公司、地点、经验、学历) |
|||
|
|||
## 注意事项 |
|||
1. **模拟数据**: 当前版本中 `MockCrawler` 和 `Job51Crawler` 使用模拟数据进行演示,`ZhaopinCrawler` 会通过Jsoup发起真实的网络请求;实际项目中需要将模拟部分替换为真实的爬取逻辑
|||
2. **反爬机制**: 实际爬取时需要注意网站的反爬机制,建议: |
|||
- 添加合理的请求延迟 |
|||
- 使用代理IP池 |
|||
- 设置合适的User-Agent |
|||
3. **法律合规**: 爬取数据时请遵守相关法律法规和网站的使用条款 |
|||
|
|||
## 扩展建议 |
|||
1. **支持更多网站**: 添加前程无忧、猎聘网等招聘平台的支持 |
|||
2. **数据可视化**: 使用JFreeChart等库生成图表 |
|||
3. **定时任务**: 使用Quartz实现定时爬取 |
|||
4. **数据库存储**: 使用MySQL等数据库替代CSV文件 |
|||
5. **数据分析**: 添加薪资分析、技能需求分析等功能 |
|||
|
|||
## 常见问题 |
|||
|
|||
### Q: 如何修改爬取的职位数量? |
|||
A: 在JobCrawler.java中修改循环次数或条件。 |
|||
|
|||
### Q: 如何添加新的招聘网站? |
|||
A: 在JobCrawler.java中添加新的爬取方法,或修改现有方法以支持新的URL。 |
|||
|
|||
### Q: CSV文件在哪里? |
|||
A: 默认在项目根目录下,每个爬虫对应一个文件,文件名为 `<爬虫名称>_jobs.csv`(例如 `模拟数据爬虫_jobs.csv`)。
|||
|
|||
### Q: 如何修改保存路径? |
|||
A: 在Main.java中修改 `filePath` 变量的值。 |
|||
|
|||
## 学习资源 |
|||
- [Jsoup官方文档](https://jsoup.org/) |
|||
- [Maven官方文档](https://maven.apache.org/) |
|||
- [Java官方文档](https://docs.oracle.com/en/java/) |
|||
|
|||
## 许可证 |
|||
本项目仅供学习交流使用。 |
|||
|
|||
## 联系方式 |
|||
如有问题或建议,欢迎交流讨论。 |
|||
|
|||
--- |
|||
|
|||
**祝您学习愉快!** |
|||
|
Binary file not shown.
@ -0,0 +1,38 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
|
|||
<groupId>com.example</groupId> |
|||
<artifactId>bigdata-job-crawler</artifactId> |
|||
<version>1.0-SNAPSHOT</version> |
|||
|
|||
<properties> |
|||
<maven.compiler.source>11</maven.compiler.source> |
|||
<maven.compiler.target>11</maven.compiler.target> |
|||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
<configuration> |
|||
<source>11</source> |
|||
<target>11</target> |
|||
</configuration> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -0,0 +1,53 @@ |
|||
package com.example; |
|||
|
|||
import com.example.crawler.BaseCrawler; |
|||
import com.example.crawler.MockCrawler; |
|||
import com.example.crawler.ZhaopinCrawler; |
|||
import com.example.crawler.Job51Crawler; |
|||
import com.example.processor.DataProcessor; |
|||
import com.example.storage.DataStorage; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class Main { |
|||
public static void main(String[] args) { |
|||
try { |
|||
// 创建不同的爬虫实例(多态:使用父类引用指向子类对象)
|
|||
BaseCrawler[] crawlers = { |
|||
new MockCrawler(5), // 模拟数据爬虫,爬取5个职位
|
|||
new ZhaopinCrawler(3), // 智联招聘爬虫,爬取3个职位
|
|||
new Job51Crawler(4) // 前程无忧爬虫,爬取4个职位
|
|||
}; |
|||
|
|||
// 数据处理器和存储
|
|||
DataProcessor processor = new DataProcessor(); |
|||
DataStorage storage = new DataStorage(); |
|||
|
|||
System.out.println("====== 大数据招聘爬虫系统 ======"); |
|||
System.out.println(); |
|||
|
|||
// 遍历所有爬虫,执行爬取(多态:调用子类的crawlJobs方法)
|
|||
for (BaseCrawler crawler : crawlers) { |
|||
System.out.println("正在使用 " + crawler.getName() + " 爬取数据..."); |
|||
|
|||
// 1. 爬取数据
|
|||
List<Map<String, String>> jobs = crawler.crawlJobs(); |
|||
System.out.println(crawler.getName() + " 爬取到 " + jobs.size() + " 个职位"); |
|||
|
|||
// 2. 处理数据
|
|||
List<Map<String, String>> processedJobs = processor.processData(jobs); |
|||
|
|||
// 3. 存储数据
|
|||
String filePath = crawler.getName() + "_jobs.csv"; |
|||
storage.saveToCsv(processedJobs, filePath); |
|||
System.out.println("数据已保存到: " + filePath); |
|||
System.out.println(); |
|||
} |
|||
|
|||
System.out.println("====== 所有爬虫任务完成 ======"); |
|||
|
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
Binary file not shown.
@ -0,0 +1,64 @@ |
|||
package com.example.crawler; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/**
 * Common base class for all crawlers: stores the crawler's display name and job limit,
 * declares the crawl operation and offers shared progress-logging helpers.
 */
public abstract class BaseCrawler {

    /** Display name of this crawler. */
    protected String name;

    /** Upper bound on the number of jobs to collect. */
    protected int maxJobs;

    /**
     * Creates a crawler with the given display name and job limit.
     *
     * @param name    crawler display name
     * @param maxJobs maximum number of jobs to collect
     */
    public BaseCrawler(String name, int maxJobs) {
        this.maxJobs = maxJobs;
        this.name = name;
    }

    /**
     * Returns the display name of this crawler.
     *
     * @return the crawler name
     */
    public String getName() {
        return this.name;
    }

    /**
     * Returns the maximum number of jobs this crawler collects.
     *
     * @return the job limit
     */
    public int getMaxJobs() {
        return this.maxJobs;
    }

    /**
     * Collects job postings; every subclass provides its own implementation.
     *
     * @return list of jobs, each represented as a field-name to value map
     * @throws IOException if a network request fails
     */
    public abstract List<Map<String, String>> crawlJobs() throws IOException;

    /** Prints the banner announcing the start of a crawl together with the job limit. */
    protected void printStartInfo() {
        String banner = "====== " + name + " 开始爬取 ======";
        String limitLine = "最大爬取职位数: " + maxJobs;
        System.out.println(banner);
        System.out.println(limitLine);
    }

    /**
     * Prints the banner announcing the end of a crawl, followed by an empty line.
     *
     * @param jobCount number of jobs actually collected
     */
    protected void printEndInfo(int jobCount) {
        String banner = "====== " + name + " 爬取完成 ======";
        String summary = "总共成功爬取 " + jobCount + " 个职位";
        System.out.println(banner);
        System.out.println(summary);
        System.out.println();
    }
}
|||
@ -0,0 +1,94 @@ |
|||
package com.example.crawler; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* 前程无忧爬虫(模拟实现) |
|||
*/ |
|||
public class Job51Crawler extends BaseCrawler { |
|||
|
|||
/** |
|||
* 构造方法 |
|||
* @param maxJobs 最大爬取职位数 |
|||
*/ |
|||
public Job51Crawler(int maxJobs) { |
|||
super("前程无忧爬虫", maxJobs); |
|||
} |
|||
|
|||
@Override |
|||
public List<Map<String, String>> crawlJobs() throws IOException { |
|||
List<Map<String, String>> jobs = new ArrayList<>(); |
|||
|
|||
printStartInfo(); |
|||
|
|||
try { |
|||
// 模拟前程无忧爬取过程
|
|||
System.out.println("正在连接前程无忧网站..."); |
|||
|
|||
// 模拟网络延迟
|
|||
Thread.sleep(1200); |
|||
|
|||
// 模拟解析页面
|
|||
System.out.println("正在解析职位信息..."); |
|||
|
|||
// 模拟数据
|
|||
String[] titles = { |
|||
"大数据开发", "数据分析师", "数据挖掘工程师", |
|||
"大数据架构师", "AI算法工程师", "数据仓库开发" |
|||
}; |
|||
|
|||
String[] companies = { |
|||
"华为", "小米", "OPPO", |
|||
"vivo", "荣耀", "realme" |
|||
}; |
|||
|
|||
String[] salaries = { |
|||
"20-35K", "15-25K", "25-45K", |
|||
"30-55K", "28-50K", "18-30K" |
|||
}; |
|||
|
|||
String[] locations = { |
|||
"深圳", "北京", "上海", |
|||
"深圳", "北京", "深圳" |
|||
}; |
|||
|
|||
String[] experiences = { |
|||
"3-5年", "1-3年", "3-5年", |
|||
"5-10年", "3-5年", "2-4年" |
|||
}; |
|||
|
|||
String[] educations = { |
|||
"本科", "本科", "硕士", |
|||
"硕士", "硕士", "本科" |
|||
}; |
|||
|
|||
// 创建模拟的职位数据
|
|||
int count = 0; |
|||
for (int i = 0; i < titles.length && count < maxJobs; i++) { |
|||
Map<String, String> job = new HashMap<>(); |
|||
job.put("title", titles[i]); |
|||
job.put("salary", salaries[i]); |
|||
job.put("company", companies[i]); |
|||
job.put("location", locations[i]); |
|||
job.put("experience", experiences[i]); |
|||
job.put("education", educations[i]); |
|||
|
|||
jobs.add(job); |
|||
count++; |
|||
System.out.println("成功解析第 " + count + " 个职位: " + titles[i]); |
|||
} |
|||
|
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
System.out.println("爬取过程被中断"); |
|||
} |
|||
|
|||
printEndInfo(jobs.size()); |
|||
|
|||
return jobs; |
|||
} |
|||
} |
|||
@ -0,0 +1,88 @@ |
|||
package com.example.crawler; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* 模拟数据爬虫,用于演示 |
|||
*/ |
|||
public class MockCrawler extends BaseCrawler { |
|||
|
|||
/** |
|||
* 构造方法 |
|||
* @param maxJobs 最大爬取职位数 |
|||
*/ |
|||
public MockCrawler(int maxJobs) { |
|||
super("模拟数据爬虫", maxJobs); |
|||
} |
|||
|
|||
@Override |
|||
public List<Map<String, String>> crawlJobs() throws IOException { |
|||
List<Map<String, String>> jobs = new ArrayList<>(); |
|||
|
|||
printStartInfo(); |
|||
|
|||
// 模拟爬取过程,实际项目中可以替换为真实的爬取逻辑
|
|||
String[] mockTitles = { |
|||
"大数据开发工程师", "数据分析师", "数据挖掘工程师", |
|||
"大数据架构师", "机器学习工程师", "数据仓库工程师", |
|||
"实时计算工程师", "大数据运维工程师", "数据产品经理", "算法工程师" |
|||
}; |
|||
|
|||
String[] mockCompanies = { |
|||
"阿里巴巴", "腾讯", "百度", "字节跳动", "京东", |
|||
"美团", "华为", "小米", "滴滴", "网易" |
|||
}; |
|||
|
|||
String[] mockSalaries = { |
|||
"20-35K", "15-25K", "25-40K", "30-50K", "18-30K", |
|||
"20-35K", "35-55K", "22-38K", "25-45K", "28-50K" |
|||
}; |
|||
|
|||
String[] mockLocations = { |
|||
"北京", "上海", "深圳", "杭州", "广州", |
|||
"北京", "深圳", "上海", "杭州", "北京" |
|||
}; |
|||
|
|||
String[] mockExperiences = { |
|||
"3-5年", "1-3年", "5-10年", "10年以上", "3-5年", |
|||
"5-10年", "10年以上", "3-5年", "5-10年", "3-5年" |
|||
}; |
|||
|
|||
String[] mockEducations = { |
|||
"本科", "本科", "硕士", "硕士", "本科", |
|||
"硕士", "博士", "本科", "硕士", "硕士" |
|||
}; |
|||
|
|||
// 模拟网络延迟
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
|
|||
// 创建模拟的职位数据
|
|||
int count = 0; |
|||
for (int i = 0; i < mockTitles.length && count < maxJobs; i++) { |
|||
Map<String, String> job = new HashMap<>(); |
|||
job.put("title", mockTitles[i]); |
|||
job.put("salary", mockSalaries[i]); |
|||
job.put("company", mockCompanies[i]); |
|||
job.put("location", mockLocations[i]); |
|||
job.put("experience", mockExperiences[i]); |
|||
job.put("education", mockEducations[i]); |
|||
|
|||
jobs.add(job); |
|||
count++; |
|||
System.out.println("成功解析第 " + count + " 个职位: " + mockTitles[i]); |
|||
} |
|||
|
|||
printEndInfo(jobs.size()); |
|||
System.out.println("注意:这是模拟数据,实际项目中需要替换为真实的爬取逻辑"); |
|||
|
|||
return jobs; |
|||
} |
|||
} |
|||
Binary file not shown.
@ -0,0 +1,181 @@ |
|||
package com.example.crawler; |
|||
|
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* 智联招聘爬虫(真实实现) |
|||
*/ |
|||
public class ZhaopinCrawler extends BaseCrawler { |
|||
|
|||
/** |
|||
* 构造方法 |
|||
* @param maxJobs 最大爬取职位数 |
|||
*/ |
|||
public ZhaopinCrawler(int maxJobs) { |
|||
super("智联招聘爬虫", maxJobs); |
|||
} |
|||
|
|||
@Override |
|||
public List<Map<String, String>> crawlJobs() throws IOException { |
|||
List<Map<String, String>> jobs = new ArrayList<>(); |
|||
|
|||
printStartInfo(); |
|||
|
|||
try { |
|||
// 智联招聘搜索URL(搜索大数据相关职位)
|
|||
String url = "https://sou.zhaopin.com/?jl=530&kw=大数据&kt=3"; |
|||
|
|||
System.out.println("正在连接智联招聘网站..."); |
|||
|
|||
// 设置请求头,模拟浏览器访问
|
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
|
|||
System.out.println("正在解析职位信息..."); |
|||
|
|||
// 打印页面标题,确认是否成功获取页面
|
|||
System.out.println("页面标题: " + doc.title()); |
|||
|
|||
// 查找职位列表 - 尝试不同的选择器
|
|||
Elements jobElements = doc.select(".joblist__list"); |
|||
if (jobElements.isEmpty()) { |
|||
jobElements = doc.select(".list__module"); |
|||
} |
|||
if (jobElements.isEmpty()) { |
|||
jobElements = doc.select("div[class*=job]"); |
|||
} |
|||
|
|||
System.out.println("找到职位元素数量: " + jobElements.size()); |
|||
|
|||
// 如果找到的是容器,再查找内部的职位项
|
|||
if (!jobElements.isEmpty() && jobElements.size() == 1) { |
|||
Elements innerJobs = jobElements.first().select("div[class*=job]"); |
|||
if (!innerJobs.isEmpty()) { |
|||
jobElements = innerJobs; |
|||
System.out.println("找到内部职位项数量: " + jobElements.size()); |
|||
} |
|||
} |
|||
|
|||
int count = 0; |
|||
for (Element jobElement : jobElements) { |
|||
if (count >= maxJobs) break; |
|||
|
|||
Map<String, String> job = new HashMap<>(); |
|||
|
|||
// 提取职位名称 - 尝试多种选择器
|
|||
Element titleElement = jobElement.selectFirst("a[class*=job-name]"); |
|||
if (titleElement == null) { |
|||
titleElement = jobElement.selectFirst("a[class*=jobName]"); |
|||
} |
|||
if (titleElement == null) { |
|||
titleElement = jobElement.selectFirst("h3"); |
|||
} |
|||
if (titleElement != null) { |
|||
job.put("title", titleElement.text().trim()); |
|||
} |
|||
|
|||
// 提取薪资
|
|||
Element salaryElement = jobElement.selectFirst("span[class*=salary]"); |
|||
if (salaryElement == null) { |
|||
salaryElement = jobElement.selectFirst(".salary"); |
|||
} |
|||
if (salaryElement != null) { |
|||
job.put("salary", salaryElement.text().trim()); |
|||
} |
|||
|
|||
// 提取公司名称
|
|||
Element companyElement = jobElement.selectFirst("a[class*=company]"); |
|||
if (companyElement == null) { |
|||
companyElement = jobElement.selectFirst(".company"); |
|||
} |
|||
if (companyElement != null) { |
|||
job.put("company", companyElement.text().trim()); |
|||
} |
|||
|
|||
// 提取地点、经验、学历
|
|||
Elements infoElements = jobElement.select(".job-info"); |
|||
if (infoElements.isEmpty()) { |
|||
infoElements = jobElement.select("div[class*=info]"); |
|||
} |
|||
if (!infoElements.isEmpty()) { |
|||
Elements spans = infoElements.first().select("span"); |
|||
if (spans.size() >= 3) { |
|||
job.put("location", spans.get(0).text().trim()); |
|||
job.put("experience", spans.get(1).text().trim()); |
|||
job.put("education", spans.get(2).text().trim()); |
|||
} |
|||
} |
|||
|
|||
// 确保所有字段都有值
|
|||
job.putIfAbsent("title", ""); |
|||
job.putIfAbsent("salary", ""); |
|||
job.putIfAbsent("company", ""); |
|||
job.putIfAbsent("location", ""); |
|||
job.putIfAbsent("experience", ""); |
|||
job.putIfAbsent("education", ""); |
|||
|
|||
// 只添加有效的职位
|
|||
if (!job.get("title").isEmpty()) { |
|||
jobs.add(job); |
|||
count++; |
|||
System.out.println("成功解析第 " + count + " 个职位: " + job.get("title")); |
|||
System.out.println("薪资: " + job.get("salary") + ", 公司: " + job.get("company")); |
|||
System.out.println("地点: " + job.get("location") + ", 经验: " + job.get("experience") + ", 学历: " + job.get("education")); |
|||
|
|||
// 模拟网络延迟,避免被反爬
|
|||
Thread.sleep(1000); |
|||
} |
|||
} |
|||
|
|||
// 如果没有找到职位,尝试直接解析页面内容
|
|||
if (jobs.isEmpty()) { |
|||
System.out.println("尝试直接解析页面内容..."); |
|||
// 查找所有包含职位信息的元素
|
|||
Elements allElements = doc.select("div[class*=item]"); |
|||
System.out.println("找到项目元素数量: " + allElements.size()); |
|||
|
|||
for (Element element : allElements) { |
|||
if (count >= maxJobs) break; |
|||
|
|||
String text = element.text(); |
|||
if (text.contains("大数据") && (text.contains("K") || text.contains("元"))) { |
|||
Map<String, String> job = new HashMap<>(); |
|||
job.put("title", text.substring(0, Math.min(50, text.length()))); |
|||
job.put("salary", ""); |
|||
job.put("company", ""); |
|||
job.put("location", ""); |
|||
job.put("experience", ""); |
|||
job.put("education", ""); |
|||
|
|||
jobs.add(job); |
|||
count++; |
|||
System.out.println("成功解析第 " + count + " 个职位: " + job.get("title")); |
|||
} |
|||
} |
|||
} |
|||
|
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
System.out.println("爬取过程被中断"); |
|||
} catch (IOException e) { |
|||
System.out.println("网络请求失败: " + e.getMessage()); |
|||
// 如果网络请求失败,返回空列表
|
|||
} |
|||
|
|||
printEndInfo(jobs.size()); |
|||
|
|||
return jobs; |
|||
} |
|||
} |
|||
@ -0,0 +1,21 @@ |
|||
package com.example.processor; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/**
 * Cleans crawled job records before they are stored.
 */
public class DataProcessor {

    /**
     * Normalizes every field of every job in place: values are trimmed and
     * {@code null} values become empty strings.
     *
     * @param jobs the crawled jobs; may be {@code null}, and may contain {@code null}
     *             entries, which are left untouched
     * @return the same list instance after cleaning, or a new empty list when
     *         {@code jobs} is {@code null}
     */
    public List<Map<String, String>> processData(List<Map<String, String>> jobs) {
        if (jobs == null) {
            return new java.util.ArrayList<>();
        }
        for (Map<String, String> job : jobs) {
            if (job == null) {
                continue;
            }
            // Trimming covers all fields, including salary, which is kept as crawled.
            job.replaceAll((key, value) -> value == null ? "" : value.trim());
        }
        return jobs;
    }
}
|||
@ -0,0 +1,31 @@ |
|||
package com.example.storage; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/**
 * Persists job records as a CSV file.
 */
public class DataStorage {

    /** Map keys written to each row, in column order (matches the header line). */
    private static final String[] FIELD_KEYS = {
        "title", "salary", "company", "location", "experience", "education"
    };

    /**
     * Writes a header line followed by one CSV row per job. Missing or {@code null}
     * fields are written as empty values; values containing a comma, double quote or
     * line break are quoted so that they stay within a single column.
     *
     * @param jobs     jobs to write; {@code null} is treated as empty and
     *                 {@code null} entries are skipped
     * @param filePath path of the CSV file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    public void saveToCsv(List<Map<String, String>> jobs, String filePath) throws IOException {
        // NOTE(review): FileWriter uses the platform default charset here — confirm
        // whether a fixed encoding is wanted for the Chinese header and data.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) {
            writer.write("职位名称,薪资,公司名称,地点,经验要求,学历要求");
            writer.newLine();

            if (jobs == null) {
                return;
            }
            for (Map<String, String> job : jobs) {
                if (job == null) {
                    continue;
                }
                StringBuilder row = new StringBuilder();
                for (int i = 0; i < FIELD_KEYS.length; i++) {
                    if (i > 0) {
                        row.append(',');
                    }
                    row.append(escapeCsv(job.get(FIELD_KEYS[i])));
                }
                writer.write(row.toString());
                writer.newLine();
            }
        }
    }

    /** Quotes a CSV field when needed and doubles embedded quotes; null becomes empty. */
    private static String escapeCsv(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
|
@ -0,0 +1,7 @@ |
|||
com\example\Main.class |
|||
com\example\processor\DataProcessor.class |
|||
com\example\crawler\MockCrawler.class |
|||
com\example\crawler\Job51Crawler.class |
|||
com\example\storage\DataStorage.class |
|||
com\example\crawler\BaseCrawler.class |
|||
com\example\crawler\ZhaopinCrawler.class |
|||
@ -0,0 +1,7 @@ |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\Main.java |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\crawler\BaseCrawler.java |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\crawler\Job51Crawler.java |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\crawler\MockCrawler.java |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\storage\DataStorage.java |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\crawler\ZhaopinCrawler.java |
|||
C:\Users\lenovo\Desktop\Code\java\大数据招聘爬虫\PaChong\src\main\java\com\example\processor\DataProcessor.java |
|||
|
|
|
Loading…
Reference in new issue