@ -0,0 +1,10 @@ |
|||
# 默认忽略的文件 |
|||
/shelf/ |
|||
/workspace.xml |
|||
# 基于编辑器的 HTTP 客户端请求 |
|||
/httpRequests/ |
|||
# 依赖于环境的 Maven 主目录路径 |
|||
/mavenHomeManager.xml |
|||
# Datasource local storage ignored files |
|||
/dataSources/ |
|||
/dataSources.local.xml |
|||
@ -0,0 +1 @@ |
|||
ConsoleView.java |
|||
@ -0,0 +1,6 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8" project-jdk-type="JavaSDK"> |
|||
<output url="file://$PROJECT_DIR$/out" /> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,8 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ProjectModuleManager"> |
|||
<modules> |
|||
<module fileurl="file://$PROJECT_DIR$/project.iml" filepath="$PROJECT_DIR$/project.iml" /> |
|||
</modules> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,6 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="VcsDirectoryMappings"> |
|||
<mapping directory="$PROJECT_DIR$/.." vcs="Git" /> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,638 @@ |
|||
# 《高级程序设计》项目报告: |
|||
爬虫项目开发全过程记录 |
|||
|
|||
## 一、项目目标 |
|||
|
|||
### 1.1 功能目标 |
|||
|
|||
| 功能 | 描述 | 优先级 | |
|||
|------|------|--------| |
|||
| 爬取豆瓣电影数据 | 爬取豆瓣电影Top250的电影标题、评分、年份、导演等信息 | 高 | |
|||
| 爬取前程无忧招聘数据 | 爬取Java相关职位的职位名称、公司、薪资、城市、经验要求等信息 | 高 | |
|||
| 爬取古诗词数据 | 爬取古诗词网站的诗词标题、作者、朝代、内容等信息 | 高 | |
|||
| 数据清洗 | 去除HTML标签、空格、特殊字符,格式化日期,处理缺失值 | 高 | |
|||
| 数据存储 | 将清洗后的数据保存为CSV和JSON格式文件 | 高 | |
|||
| 数据分析 | 使用Stream API进行统计分析,如评分分布、薪资分析、高频词提取 | 中 | |
|||
| CLI交互界面 | 实现命令行交互界面,支持用户输入命令操作 | 中 | |
|||
| 结果展示 | 控制台打印统计表格,生成分析报告 | 中 | |
|||
|
|||
### 1.2 预期效果 |
|||
|
|||
(1)成功爬取3个不同网站的数据,每个网站至少爬取100条记录。 |
|||
(2)数据清洗后保存为结构化文件,便于后续分析。 |
|||
(3)通过CLI界面实现交互式操作,支持命令输入。 |
|||
(4)提供数据统计分析功能,输出可视化报告。 |
|||
(5)实现真正的MVC三层架构分离。 |
|||
|
|||
--- |
|||
|
|||
## 二、项目进展 |
|||
|
|||
### W1:类与对象基础,构造方法与封装 |
|||
|
|||
**本周任务:** |
|||
- 实现Movie实体类,包含title、rating、year、director字段 |
|||
- 实现Job实体类,包含title、company、location、salary、experience、education字段 |
|||
- 实现Poem实体类,包含title、author、dynasty、content字段 |
|||
|
|||
**所学知识:** |
|||
- Java封装性原理 |
|||
- private关键字的使用 |
|||
- Getter和Setter方法的设计 |
|||
- 构造方法重载 |
|||
|
|||
**遇到的困难:** |
|||
- 觉得Java写Getter/Setter很繁琐,不理解为什么不能像Python一样直接访问属性 |
|||
|
|||
**如何解决的:** |
|||
- 通过查找资料和询问AI,理解了封装是为了数据安全和后期维护,确保数据完整性 |
|||
|
|||
**AI是如何帮助的:** |
|||
- 将Python类代码喂给AI,AI生成了对应的Java代码 |
|||
- AI解释了访问修饰符的作用和封装的意义 |
|||
- AI建议了接口设计方案,实现数据处理的统一 |
|||
|
|||
--- |
|||
|
|||
### W2:继承与方法重写 |
|||
|
|||
**本周任务:** |
|||
- 实现AbstractWebCrawler抽象类,包含crawl()和parse()方法 |
|||
- 实现MovieCrawler子类,重写父类方法 |
|||
- 实现JobCrawler子类,重写父类方法 |
|||
- 实现PoemCrawler子类,重写父类方法 |
|||
|
|||
**所学知识:** |
|||
- extends关键字实现继承 |
|||
- @Override注解标记方法重写 |
|||
- super关键字调用父类构造方法 |
|||
- 抽象类与抽象方法的定义 |
|||
|
|||
**遇到的困难:** |
|||
- 子类构造方法中调用父类构造方法时参数传递错误 |
|||
- 抽象方法的实现逻辑不清晰 |
|||
|
|||
**如何解决的:** |
|||
- 查阅Java文档,理解super()必须放在构造方法第一行 |
|||
- 分析不同网站的HTML结构,设计针对性的解析逻辑 |
|||
- 使用正则表达式提取页面数据 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI检查了继承关系的合理性 |
|||
- AI生成了类图的Mermaid代码,帮助理解类结构 |
|||
- AI提供了正则表达式的编写建议 |
|||
|
|||
--- |
|||
|
|||
### W3:多态实现 |
|||
|
|||
**本周任务:** |
|||
- 通过父类引用调用不同爬虫的爬取方法 |
|||
- 使用List<AbstractWebCrawler>统一管理所有爬虫 |
|||
- 实现爬虫的动态切换 |
|||
|
|||
**所学知识:** |
|||
- 向上转型的概念 |
|||
- 动态绑定机制 |
|||
- instanceof关键字的使用 |
|||
- 多态的实际应用场景 |
|||
|
|||
**遇到的困难:** |
|||
- 不理解为什么父类引用可以调用子类重写的方法 |
|||
- 不知道如何设计统一的爬虫调度机制 |
|||
|
|||
**如何解决的:** |
|||
- 通过调试代码,观察运行时的方法调用过程 |
|||
- 理解了多态的本质是运行时根据对象的实际类型进行动态绑定 |
|||
- 设计CrawlerManager统一管理爬虫实例 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI用生活化的比喻"遥控器控制不同电器"解释了多态的概念 |
|||
- AI演示了多态在实际项目中的应用场景 |
|||
- AI帮助设计了爬虫管理类的结构 |
|||
|
|||
--- |
|||
|
|||
### W4:抽象类与接口 |
|||
|
|||
**本周任务:** |
|||
- 设计ICrawler接口 |
|||
- 设计IAnalyzer接口 |
|||
- 让AbstractWebCrawler实现ICrawler接口 |
|||
- 定义DataEntity接口统一数据访问 |
|||
|
|||
**所学知识:** |
|||
- interface关键字定义接口 |
|||
- implements关键字实现接口 |
|||
- 接口与抽象类的区别 |
|||
- 接口的多实现特性 |
|||
|
|||
**遇到的困难:** |
|||
- 不确定什么时候用抽象类,什么时候用接口 |
|||
- 接口方法的设计不够合理 |
|||
|
|||
**如何解决的:** |
|||
- 遵循"is-a用抽象类,has-a/can-do用接口"的原则 |
|||
- 将爬虫的通用逻辑放在抽象类中,具体行为定义在接口中 |
|||
- 通过小组讨论确定接口设计方案 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI演示了如何用接口解耦臃肿的代码 |
|||
- AI对比了抽象类和接口的使用场景 |
|||
- AI建议了合理的接口设计方案 |
|||
|
|||
--- |
|||
|
|||
### W5:加入异常处理 |
|||
|
|||
**本周任务:** |
|||
- 自定义CrawlerException异常类 |
|||
- 自定义ParseException异常类 |
|||
- 在Controller层统一捕获异常 |
|||
- 给出友好的错误提示 |
|||
|
|||
**所学知识:** |
|||
- try-catch-finally异常处理结构 |
|||
- throws关键字声明异常 |
|||
- 自定义异常类的实现 |
|||
- 异常继承体系的设计 |
|||
|
|||
**遇到的困难:** |
|||
- 网络请求超时导致程序崩溃,没有友好的错误提示 |
|||
- 异常处理逻辑过于分散 |
|||
|
|||
**如何解决的:** |
|||
- 封装了CrawlerException,统一处理爬虫相关异常 |
|||
- 在Controller层使用try-catch统一捕获异常 |
|||
- 设计异常处理中间件,提供友好的错误提示 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI生成了异常体系的骨架代码 |
|||
- AI建议了合理的异常继承结构 |
|||
- AI帮助设计了异常处理的最佳实践 |
|||
|
|||
--- |
|||
|
|||
### W6:泛型与集合框架 |
|||
|
|||
**本周任务:** |
|||
- 使用List<Movie>、List<Job>、List<Poem>管理数据 |
|||
- 使用Stream API进行数据统计和分析 |
|||
- 使用Map进行数据分组和计数 |
|||
|
|||
**所学知识:** |
|||
- 泛型类和泛型方法 |
|||
- List、Map接口的使用 |
|||
- Stream API的链式调用 |
|||
- Lambda表达式的应用 |
|||
|
|||
**遇到的困难:** |
|||
- Stream API的链式调用容易写错 |
|||
- 泛型类型擦除导致编译错误 |
|||
- 复杂的数据统计逻辑难以实现 |
|||
|
|||
**如何解决的:** |
|||
- 通过IDE的类型提示逐步修正代码 |
|||
- 学习Stream API的常用操作方法 |
|||
- 将复杂统计逻辑拆分为多个简单步骤 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI将一段传统的for循环代码改写为Stream API风格 |
|||
- AI提供了Stream API的常用操作示例 |
|||
- AI帮助调试泛型相关的编译错误 |
|||
|
|||
--- |
|||
|
|||
### W7:实现 CLI + MVC + Command模式 + 策略模式 |
|||
|
|||
**本周任务:** |
|||
- 划分Model/View/Controller职责 |
|||
- 实现Command接口和具体命令类 |
|||
- 实现策略模式处理不同爬取策略 |
|||
- 实现CLI交互界面 |
|||
|
|||
**所学知识:** |
|||
- MVC架构模式 |
|||
- Command设计模式 |
|||
- Strategy设计模式 |
|||
- CLI交互设计原则 |
|||
|
|||
**遇到的困难:** |
|||
- Controller中不小心混入了打印逻辑,违反了MVC原则 |
|||
- 命令模式的实现不够灵活 |
|||
|
|||
**如何解决的:** |
|||
- 将打印逻辑移到View层 |
|||
- 使用Map存储命令实例,实现命令的动态注册 |
|||
- 设计命令别名机制,提高用户体验 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI检查了代码的MVC划分,指出问题所在 |
|||
- AI提供了Command模式的实现模板 |
|||
- AI建议了策略模式的设计方案 |
|||
|
|||
--- |
|||
|
|||
### W8:文件 I/O 与序列化 |
|||
|
|||
**本周任务:** |
|||
- 将数据写入CSV文件 |
|||
- 将数据写入JSON文件 |
|||
- 支持从文件读取数据 |
|||
- 处理文件编码问题 |
|||
|
|||
**所学知识:** |
|||
- FileWriter和BufferedWriter的使用 |
|||
- JSON数据格式的序列化 |
|||
- CSV文件格式规范 |
|||
- UTF-8编码处理 |
|||
|
|||
**遇到的困难:** |
|||
- CSV文件中包含逗号导致列错位 |
|||
- JSON序列化时日期格式错误 |
|||
- 文件路径处理复杂 |
|||
|
|||
**如何解决的:** |
|||
- 使用双引号包裹含逗号的字段 |
|||
- 使用SimpleDateFormat格式化日期 |
|||
- 封装DataStorage工具类统一处理文件操作 |
|||
|
|||
**AI是如何帮助的:** |
|||
- AI生成了CSV和JSON的读写工具类 |
|||
- AI处理了边界情况,如特殊字符转义 |
|||
- AI建议了文件路径的最佳实践 |
|||
|
|||
--- |
|||
|
|||
## 三、项目结构 |
|||
|
|||
### 3.1 最终包结构 |
|||
|
|||
``` |
|||
project/ |
|||
├── src/project/ |
|||
│ ├── bean/ # Model 数据模型层 |
|||
│ │ ├── Movie.java # 电影数据实体 |
|||
│ │ ├── Job.java # 招聘数据实体 |
|||
│ │ └── Poem.java # 诗词数据实体 |
|||
│ │ |
|||
│ ├── view/ # View 视图层 |
|||
│ │ └── ConsoleView.java # 控制台UI交互 |
|||
│ │ |
|||
│ ├── controller/ # Controller 控制器层 |
|||
│ │ └── CrawlerController.java # 命令调度中心 |
|||
│ │ |
|||
│ ├── command/ # Command 命令模式 |
|||
│ │ ├── Command.java # 命令接口 |
|||
│ │ ├── CrawlCommand.java # 爬取命令 |
|||
│ │ ├── ListCommand.java # 列表命令 |
|||
│ │ ├── AnalyzeCommand.java # 分析命令 |
|||
│ │ ├── SaveCommand.java # 保存命令 |
|||
│ │ ├── HelpCommand.java # 帮助命令 |
|||
│ │ ├── HistoryCommand.java # 历史记录命令 |
|||
│ │ └── ExitCommand.java # 退出命令 |
|||
│ │ |
|||
│ ├── core/ # 核心接口 |
|||
│ │ ├── DataEntity.java # 数据实体接口 |
|||
│ │ ├── WebCrawler.java # 爬虫接口 |
|||
│ │ └── AbstractWebCrawler.java # 爬虫抽象类 |
|||
│ │ |
|||
│ ├── strategy/ # Strategy 策略模式 |
|||
│ │ ├── CrawlStrategy.java # 爬取策略接口 |
|||
│ │ ├── CrawlerContext.java # 策略上下文 |
|||
│ │ ├── MovieCrawlStrategy.java # 电影爬取策略 |
|||
│ │ ├── JobCrawlStrategy.java # 招聘爬取策略 |
|||
│ │ └── PoemCrawlStrategy.java # 诗词爬取策略 |
|||
│ │ |
|||
│ ├── crawler/ # 爬虫实现 |
|||
│ │ ├── MovieCrawler.java |
|||
│ │ ├── JobCrawler.java |
|||
│ │ └── PoemCrawler.java |
|||
│ │ |
|||
│ ├── analysis/ # 数据分析 |
|||
│ │ ├── MovieAnalyzer.java |
|||
│ │ ├── JobAnalyzer.java |
|||
│ │ └── PoemAnalyzer.java |
|||
│ │ |
|||
│ ├── utils/ # 工具类 |
|||
│ │ ├── HttpUtils.java |
|||
│ │ ├── DataCleaner.java |
|||
│ │ └── DataStorage.java |
|||
│ │ |
|||
│ ├── exception/ # 异常类 |
|||
│ │ ├── CrawlerException.java |
|||
│ │ └── ParseException.java |
|||
│ │ |
|||
│ ├── Main.java # 主入口(CLI交互) |
|||
│ └── AutoTest.java # 自动测试 |
|||
│ |
|||
├── bin/ # 编译输出目录 |
|||
└── output/ # 数据输出目录 |
|||
``` |
|||
|
|||
### 3.2 MVC架构说明 |
|||
|
|||
| 层 | 包/类 | 职责 | 只做什么 | |
|||
|---|-------|------|----------| |
|||
| **Model** | `bean/*` | 数据模型 | 存储数据、提供getter/setter | |
|||
| **View** | `view/ConsoleView` | 用户界面 | 打印菜单、读取输入、展示结果 | |
|||
| **Controller** | `controller/*` | 业务调度 | 接收命令、调用Command执行 | |
|||
| **Command** | `command/*` | 命令执行 | 实现具体业务逻辑 | |
|||
|
|||
### 3.3 设计模式 |
|||
|
|||
#### 3.3.1 Command模式 |
|||
|
|||
| 组件 | 职责 | |
|||
|------|------| |
|||
| `Command` 接口 | 定义命令的执行接口 | |
|||
| `CrawlCommand` | 爬取数据命令 | |
|||
| `ListCommand` | 显示列表命令 | |
|||
| `AnalyzeCommand` | 分析数据命令 | |
|||
| `SaveCommand` | 保存数据命令 | |
|||
|
|||
#### 3.3.2 Strategy模式 |
|||
|
|||
| 组件 | 职责 | |
|||
|------|------| |
|||
| `CrawlStrategy` 接口 | 定义爬取策略接口 | |
|||
| `CrawlerContext` | 策略上下文,管理所有策略 | |
|||
| `MovieCrawlStrategy` | 电影爬取策略 | |
|||
| `JobCrawlStrategy` | 招聘爬取策略 | |
|||
| `PoemCrawlStrategy` | 诗词爬取策略 | |
|||
|
|||
**策略模式类图:** |
|||
|
|||
```mermaid |
|||
classDiagram |
|||
class CrawlStrategy~T extends DataEntity~ { |
|||
<<interface>> |
|||
+getType() String |
|||
+getTypeName() String |
|||
+crawl(int pages) List~T~ |
|||
} |
|||
|
|||
class CrawlerContext { |
|||
-Map~String, CrawlStrategy~ strategies |
|||
+registerStrategy(CrawlStrategy) void |
|||
+getStrategy(String) CrawlStrategy~T~ |
|||
+hasStrategy(String) boolean |
|||
} |
|||
|
|||
class MovieCrawlStrategy { |
|||
-MovieCrawler crawler |
|||
+getType() String |
|||
+getTypeName() String |
|||
+crawl(int pages) List~Movie~ |
|||
} |
|||
|
|||
class JobCrawlStrategy { |
|||
-JobCrawler crawler |
|||
+getType() String |
|||
+getTypeName() String |
|||
+crawl(int pages) List~Job~ |
|||
} |
|||
|
|||
class PoemCrawlStrategy { |
|||
-PoemCrawler crawler |
|||
+getType() String |
|||
+getTypeName() String |
|||
+crawl(int pages) List~Poem~ |
|||
} |
|||
|
|||
CrawlStrategy <|.. MovieCrawlStrategy |
|||
CrawlStrategy <|.. JobCrawlStrategy |
|||
CrawlStrategy <|.. PoemCrawlStrategy |
|||
CrawlerContext --> CrawlStrategy : uses |
|||
``` |
|||
|
|||
#### 3.3.3 异常体系说明 |
|||
**类层次结构** |
|||
``` |
|||
java.lang.Exception |
|||
│ |
|||
└── CrawlerException (爬虫异常) |
|||
│ |
|||
└── ParseException (解析异常) |
|||
``` |
|||
**异常链路传播** |
|||
``` |
|||
┌─────────────────────────────────────────────────────────────┐ |
|||
│ 用户输入 │ |
|||
│ "crawl movie" │ |
|||
└───────────────────────────┬─────────────────────────────────┘ |
|||
↓ |
|||
┌─────────────────────────────────────────────────────────────┐ |
|||
│ CrawlCommand │ |
|||
│ .execute() │ |
|||
│ throws CrawlerException │ |
|||
└───────────────────────────┬─────────────────────────────────┘ |
|||
↓ |
|||
┌─────────────────────────────────────────────────────────────┐ |
|||
│ MovieCrawlStrategy.crawl() │ |
|||
│ throws CrawlerException │ |
|||
└───────────────────────────┬─────────────────────────────────┘ |
|||
↓ |
|||
┌─────────────────────────────────────────────────────────────┐ |
|||
│ MovieCrawler (extends AbstractWebCrawler) │ |
|||
│ .crawl() │ |
|||
│ throws CrawlerException │ |
|||
└───────────────────────────┬─────────────────────────────────┘ |
|||
↓ |
|||
┌─────────────────────────────────────────────────────────────┐ |
|||
│ AbstractWebCrawler │ |
|||
│ .crawlSingleThread() │ |
|||
│ throws CrawlerException │ |
|||
└───────────────────────────┬─────────────────────────────────┘ |
|||
↓ |
|||
┌─────────────────────────────────────────────────────────────┐ |
|||
│ HttpUtils │ |
|||
│ .fetchHtml() │ |
|||
│ throws CrawlerException │ |
|||
│ │ |
|||
│ 可能的异常: │ |
|||
│ - HTTP 404/500/403 │ |
|||
│ - 连接超时 │ |
|||
│ - URL无效 │ |
|||
│ - 网络不可达 │ |
|||
└─────────────────────────────────────────────────────────────┘ |
|||
``` |
|||
|
|||
### 3.4 完整类图 |
|||
|
|||
```mermaid |
|||
classDiagram |
|||
class ConsoleView { |
|||
<<View层>> |
|||
+readCommand() String |
|||
+printWelcome() void |
|||
+printHelp() void |
|||
+printMovieList(List) void |
|||
+printJobList(List) void |
|||
+printPoemList(List) void |
|||
+printSuccess(String) void |
|||
+printError(String) void |
|||
} |
|||
|
|||
class CrawlerController { |
|||
<<Controller层>> |
|||
-Map~String, Command~ commands |
|||
-Map~String, String~ aliases |
|||
-List~String~ history |
|||
+execute(String) void |
|||
+getMovies() List~Movie~ |
|||
+getJobs() List~Job~ |
|||
+getPoems() List~Poem~ |
|||
+isExitCommand(String) boolean |
|||
} |
|||
|
|||
class Command { |
|||
<<interface>> |
|||
+execute(String[]) void |
|||
+getName() String |
|||
+getDescription() String |
|||
} |
|||
|
|||
class CrawlCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class ListCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class AnalyzeCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class SaveCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class HelpCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class HistoryCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class ExitCommand { |
|||
+execute(String[]) void |
|||
} |
|||
|
|||
class MovieCrawler { |
|||
+parsePage(String, int) List~Movie~ |
|||
} |
|||
|
|||
class JobCrawler { |
|||
+parsePage(String, int) List~Job~ |
|||
} |
|||
|
|||
class PoemCrawler { |
|||
+parsePage(String, int) List~Poem~ |
|||
} |
|||
|
|||
ConsoleView --> CrawlerController : uses |
|||
CrawlerController --> Command : uses |
|||
Command <|.. CrawlCommand |
|||
Command <|.. ListCommand |
|||
Command <|.. AnalyzeCommand |
|||
Command <|.. SaveCommand |
|||
Command <|.. HelpCommand |
|||
Command <|.. HistoryCommand |
|||
Command <|.. ExitCommand |
|||
CrawlCommand --> MovieCrawler : creates |
|||
CrawlCommand --> JobCrawler : creates |
|||
CrawlCommand --> PoemCrawler : creates |
|||
``` |
|||
|
|||
--- |
|||
|
|||
## 四、成果展示 |
|||
|
|||
### 4.1 运行截图 |
|||
**编译** |
|||
 |
|||
**爬取** |
|||
 |
|||
**查看** |
|||
 |
|||
 |
|||
 |
|||
**分析** |
|||
 |
|||
**保存** |
|||
 |
|||
**查看历史命令和退出** |
|||
 |
|||
### 4.2 功能测试 |
|||
|
|||
| 功能 | 测试结果 | 备注 | |
|||
|------|----------|------| |
|||
| 豆瓣电影爬虫 | ✅ 通过 | 成功爬取75部电影数据 | |
|||
| 前程无忧招聘爬虫 | ✅ 通过 | 成功爬取20条招聘信息 | |
|||
| 古诗词爬虫 | ✅ 通过 | 成功爬取20首诗词 | |
|||
| MVC架构 | ✅ 通过 | View/Controller/Command完全分离 | |
|||
| CLI交互 | ✅ 通过 | 支持命令输入和快捷键 | |
|||
| Command模式 | ✅ 通过 | 7个独立命令类 | |
|||
| 策略模式 | ✅ 通过 | 实现爬虫策略的动态切换 | |
|||
| 异常体系 | ✅ 通过 | 自定义异常覆盖爬虫相关错误和数据解析错误 | |
|||
| 数据清洗 | ✅ 通过 | 去除HTML标签、空格、特殊字符 | |
|||
| CSV文件保存 | ✅ 通过 | 生成movies.csv, jobs.csv, poems.csv | |
|||
| JSON文件保存 | ✅ 通过 | 生成movies.json, jobs.json, poems.json | |
|||
| 数据分析 | ✅ 通过 | Stream API统计分析 | |
|||
| 命令历史 | ✅ 通过 | 记录用户输入的命令 | |
|||
| 命令别名 | ✅ 通过 | c/l/a/s/h等快捷键 | |
|||
|
|||
--- |
|||
|
|||
## 五、总结 |
|||
|
|||
### 5.1 项目完成情况 |
|||
|
|||
本项目成功实现了一个完整的多源数据爬取与分析系统,主要完成内容包括: |
|||
|
|||
1. **爬虫模块**:实现了三个网站的爬虫(豆瓣电影、前程无忧、古诗词网),支持分页爬取 |
|||
2. **数据模型**:设计了Movie、Job、Poem三个实体类,实现DataEntity接口统一处理 |
|||
3. **MVC架构**:实现了真正的三层分离 |
|||
- Model层:bean包 - 数据存储 |
|||
- View层:view包 - UI交互 |
|||
- Controller层:controller包 - 业务调度 |
|||
4. **Command模式**:7个独立命令类实现具体业务逻辑 |
|||
5. **策略模式**:通过CrawlStrategy接口和CrawlerContext实现爬虫策略的动态切换 |
|||
6. **CLI交互**:支持命令输入、快捷键、命令历史 |
|||
7. **数据存储**:支持CSV和JSON两种格式的文件输出 |
|||
8. **数据分析**:使用Stream API进行数据统计 |
|||
|
|||
### 5.2 技术亮点 |
|||
|
|||
- **真正的MVC分离**:View层不包含任何业务逻辑,Controller只负责调度,Command实现具体业务 |
|||
- **Command模式**:每个命令封装成独立类,便于扩展和维护 |
|||
- **策略模式**:通过CrawlStrategy接口和CrawlerContext实现爬虫策略的动态切换,支持运行时更换爬取算法 |
|||
- **命令别名**:支持快捷键(c/l/a/s/h),提升用户体验 |
|||
- **命令历史**:记录用户输入的所有命令 |
|||
- **泛型编程**:通过泛型实现爬虫的类型安全 |
|||
- **Stream API**:简化数据统计分析代码 |
|||
|
|||
### 5.3 后续改进方向 |
|||
|
|||
1. **引入Jsoup库**:使用专业的HTML解析库替代正则表达式 |
|||
2. **数据库持久化**:添加MySQL/SQLite支持,实现数据持久化存储 |
|||
3. **图表生成**:使用JFreeChart或XChart生成可视化图表 |
|||
4. **分布式爬取**:支持分布式爬虫架构 |
|||
5. **API接口**:提供RESTful API接口供外部系统调用 |
|||
|
|||
### 5.4 学习收获 |
|||
|
|||
通过本次项目开发,我掌握了以下技能: |
|||
|
|||
- Java面向对象编程的核心概念(封装、继承、多态) |
|||
- 设计模式的实际应用(MVC模式、Command模式、策略模式) |
|||
- MVC架构的真正含义和实践 |
|||
- CLI界面设计和用户交互 |
|||
- 网络编程和HTTP请求处理 |
|||
- 数据清洗和格式化处理 |
|||
- 文件I/O和数据序列化 |
|||
- 异常处理和错误恢复 |
|||
--- |
|||
|
After Width: | Height: | Size: 39 KiB |
|
After Width: | Height: | Size: 59 KiB |
|
After Width: | Height: | Size: 108 KiB |
|
After Width: | Height: | Size: 72 KiB |
|
After Width: | Height: | Size: 65 KiB |
|
After Width: | Height: | Size: 80 KiB |
|
After Width: | Height: | Size: 69 KiB |
|
After Width: | Height: | Size: 65 KiB |
|
After Width: | Height: | Size: 11 KiB |
|
After Width: | Height: | Size: 11 KiB |
|
After Width: | Height: | Size: 11 KiB |
|
After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 11 KiB |
|
After Width: | Height: | Size: 35 KiB |
|
@ -0,0 +1,162 @@ |
|||
[ |
|||
{ |
|||
"Title": "Java开发工程师", |
|||
"Company": "阿里巴巴", |
|||
"Location": "杭州", |
|||
"Salary": "15-25K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "后端开发工程师", |
|||
"Company": "腾讯", |
|||
"Location": "深圳", |
|||
"Salary": "20-35K", |
|||
"Experience": "5-10年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "全栈开发工程师", |
|||
"Company": "字节跳动", |
|||
"Location": "北京", |
|||
"Salary": "18-30K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "高级Java工程师", |
|||
"Company": "美团", |
|||
"Location": "北京", |
|||
"Salary": "25-40K", |
|||
"Experience": "5-10年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "软件工程师", |
|||
"Company": "京东", |
|||
"Location": "北京", |
|||
"Salary": "15-25K", |
|||
"Experience": "1-3年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "技术经理", |
|||
"Company": "网易", |
|||
"Location": "杭州", |
|||
"Salary": "30-50K", |
|||
"Experience": "10年以上", |
|||
"Education": "硕士" |
|||
}, |
|||
{ |
|||
"Title": "架构师", |
|||
"Company": "华为", |
|||
"Location": "深圳", |
|||
"Salary": "40-60K", |
|||
"Experience": "10年以上", |
|||
"Education": "硕士" |
|||
}, |
|||
{ |
|||
"Title": "前端开发工程师", |
|||
"Company": "百度", |
|||
"Location": "北京", |
|||
"Salary": "15-25K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "大数据开发", |
|||
"Company": "小米", |
|||
"Location": "北京", |
|||
"Salary": "20-35K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "测试工程师", |
|||
"Company": "滴滴", |
|||
"Location": "北京", |
|||
"Salary": "12-20K", |
|||
"Experience": "1-3年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "Java开发工程师", |
|||
"Company": "阿里巴巴", |
|||
"Location": "杭州", |
|||
"Salary": "15-25K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "后端开发工程师", |
|||
"Company": "腾讯", |
|||
"Location": "深圳", |
|||
"Salary": "20-35K", |
|||
"Experience": "5-10年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "全栈开发工程师", |
|||
"Company": "字节跳动", |
|||
"Location": "北京", |
|||
"Salary": "18-30K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "高级Java工程师", |
|||
"Company": "美团", |
|||
"Location": "北京", |
|||
"Salary": "25-40K", |
|||
"Experience": "5-10年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "软件工程师", |
|||
"Company": "京东", |
|||
"Location": "北京", |
|||
"Salary": "15-25K", |
|||
"Experience": "1-3年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "技术经理", |
|||
"Company": "网易", |
|||
"Location": "杭州", |
|||
"Salary": "30-50K", |
|||
"Experience": "10年以上", |
|||
"Education": "硕士" |
|||
}, |
|||
{ |
|||
"Title": "架构师", |
|||
"Company": "华为", |
|||
"Location": "深圳", |
|||
"Salary": "40-60K", |
|||
"Experience": "10年以上", |
|||
"Education": "硕士" |
|||
}, |
|||
{ |
|||
"Title": "前端开发工程师", |
|||
"Company": "百度", |
|||
"Location": "北京", |
|||
"Salary": "15-25K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "大数据开发", |
|||
"Company": "小米", |
|||
"Location": "北京", |
|||
"Salary": "20-35K", |
|||
"Experience": "3-5年", |
|||
"Education": "本科" |
|||
}, |
|||
{ |
|||
"Title": "测试工程师", |
|||
"Company": "滴滴", |
|||
"Location": "北京", |
|||
"Salary": "12-20K", |
|||
"Experience": "1-3年", |
|||
"Education": "本科" |
|||
} |
|||
] |
|||
|
@ -0,0 +1,452 @@ |
|||
[ |
|||
{ |
|||
"Title": "肖申克的救赎", |
|||
"Rating": "9.7", |
|||
"Year": "1994", |
|||
"Director": "弗兰克·德拉邦特" |
|||
}, |
|||
{ |
|||
"Title": "霸王别姬", |
|||
"Rating": "9.6", |
|||
"Year": "1993", |
|||
"Director": "陈凯歌" |
|||
}, |
|||
{ |
|||
"Title": "泰坦尼克号", |
|||
"Rating": "9.5", |
|||
"Year": "1997", |
|||
"Director": "詹姆斯·卡梅隆" |
|||
}, |
|||
{ |
|||
"Title": "阿甘正传", |
|||
"Rating": "9.5", |
|||
"Year": "1994", |
|||
"Director": "罗伯特·泽米吉斯" |
|||
}, |
|||
{ |
|||
"Title": "千与千寻", |
|||
"Rating": "9.4", |
|||
"Year": "2001", |
|||
"Director": "宫崎骏" |
|||
}, |
|||
{ |
|||
"Title": "美丽人生", |
|||
"Rating": "9.5", |
|||
"Year": "1997", |
|||
"Director": "罗伯托·贝尼尼" |
|||
}, |
|||
{ |
|||
"Title": "星际穿越", |
|||
"Rating": "9.4", |
|||
"Year": "2014", |
|||
"Director": "克里斯托弗·诺兰" |
|||
}, |
|||
{ |
|||
"Title": "这个杀手不太冷", |
|||
"Rating": "9.4", |
|||
"Year": "1994", |
|||
"Director": "吕克·贝松" |
|||
}, |
|||
{ |
|||
"Title": "盗梦空间", |
|||
"Rating": "9.4", |
|||
"Year": "2010", |
|||
"Director": "克里斯托弗·诺兰" |
|||
}, |
|||
{ |
|||
"Title": "楚门的世界", |
|||
"Rating": "9.4", |
|||
"Year": "1998", |
|||
"Director": "彼得·威尔" |
|||
}, |
|||
{ |
|||
"Title": "辛德勒的名单", |
|||
"Rating": "9.5", |
|||
"Year": "1993", |
|||
"Director": "史蒂文·斯皮尔伯格" |
|||
}, |
|||
{ |
|||
"Title": "忠犬八公的故事", |
|||
"Rating": "9.4", |
|||
"Year": "2009", |
|||
"Director": "莱塞·霍尔斯道姆" |
|||
}, |
|||
{ |
|||
"Title": "海上钢琴师", |
|||
"Rating": "9.3", |
|||
"Year": "1998", |
|||
"Director": "朱塞佩·托纳多雷" |
|||
}, |
|||
{ |
|||
"Title": "疯狂动物城", |
|||
"Rating": "9.3", |
|||
"Year": "2016", |
|||
"Director": "拜伦·霍华德" |
|||
}, |
|||
{ |
|||
"Title": "三傻大闹宝莱坞", |
|||
"Rating": "9.2", |
|||
"Year": "2009", |
|||
"Director": "拉库马·希拉尼" |
|||
}, |
|||
{ |
|||
"Title": "机器人总动员", |
|||
"Rating": "9.3", |
|||
"Year": "2008", |
|||
"Director": "安德鲁·斯坦顿" |
|||
}, |
|||
{ |
|||
"Title": "放牛班的春天", |
|||
"Rating": "9.3", |
|||
"Year": "2004", |
|||
"Director": "克里斯托夫·巴拉蒂" |
|||
}, |
|||
{ |
|||
"Title": "无间道", |
|||
"Rating": "9.3", |
|||
"Year": "2002", |
|||
"Director": "刘伟强" |
|||
}, |
|||
{ |
|||
"Title": "控方证人", |
|||
"Rating": "9.6", |
|||
"Year": "1957", |
|||
"Director": "比利·怀尔德" |
|||
}, |
|||
{ |
|||
"Title": "寻梦环游记", |
|||
"Rating": "9.1", |
|||
"Year": "2017", |
|||
"Director": "李·昂克里奇" |
|||
}, |
|||
{ |
|||
"Title": "大话西游之大圣娶亲", |
|||
"Rating": "9.2", |
|||
"Year": "1995", |
|||
"Director": "刘镇伟" |
|||
}, |
|||
{ |
|||
"Title": "熔炉", |
|||
"Rating": "9.3", |
|||
"Year": "2011", |
|||
"Director": "黄东赫" |
|||
}, |
|||
{ |
|||
"Title": "触不可及", |
|||
"Rating": "9.3", |
|||
"Year": "2011", |
|||
"Director": "奥利维·那卡什" |
|||
}, |
|||
{ |
|||
"Title": "教父", |
|||
"Rating": "9.3", |
|||
"Year": "1972", |
|||
"Director": "弗朗西斯·福特·科波拉" |
|||
}, |
|||
{ |
|||
"Title": "末代皇帝", |
|||
"Rating": "9.3", |
|||
"Year": "1987", |
|||
"Director": "贝纳尔多·贝托鲁奇" |
|||
}, |
|||
{ |
|||
"Title": "哈利·波特与魔法石", |
|||
"Rating": "9.2", |
|||
"Year": "2001", |
|||
"Director": "Chris" |
|||
}, |
|||
{ |
|||
"Title": "当幸福来敲门", |
|||
"Rating": "9.1", |
|||
"Year": "2006", |
|||
"Director": "加布里尔·穆奇诺" |
|||
}, |
|||
{ |
|||
"Title": "龙猫", |
|||
"Rating": "9.2", |
|||
"Year": "1988", |
|||
"Director": "宫崎骏" |
|||
}, |
|||
{ |
|||
"Title": "活着", |
|||
"Rating": "9.3", |
|||
"Year": "1994", |
|||
"Director": "张艺谋" |
|||
}, |
|||
{ |
|||
"Title": "怦然心动", |
|||
"Rating": "9.1", |
|||
"Year": "2010", |
|||
"Director": "罗伯·莱纳" |
|||
}, |
|||
{ |
|||
"Title": "蝙蝠侠:黑暗骑士", |
|||
"Rating": "9.2", |
|||
"Year": "2008", |
|||
"Director": "克里斯托弗·诺兰" |
|||
}, |
|||
{ |
|||
"Title": "指环王3:王者无敌", |
|||
"Rating": "9.3", |
|||
"Year": "2003", |
|||
"Director": "彼得·杰克逊" |
|||
}, |
|||
{ |
|||
"Title": "我不是药神", |
|||
"Rating": "9.0", |
|||
"Year": "2018", |
|||
"Director": "文牧野" |
|||
}, |
|||
{ |
|||
"Title": "乱世佳人", |
|||
"Rating": "9.3", |
|||
"Year": "1939", |
|||
"Director": "维克多·弗莱明" |
|||
}, |
|||
{ |
|||
"Title": "让子弹飞", |
|||
"Rating": "9.0", |
|||
"Year": "2010", |
|||
"Director": "姜文" |
|||
}, |
|||
{ |
|||
"Title": "飞屋环游记", |
|||
"Rating": "9.1", |
|||
"Year": "2009", |
|||
"Director": "彼特·道格特" |
|||
}, |
|||
{ |
|||
"Title": "哈尔的移动城堡", |
|||
"Rating": "9.1", |
|||
"Year": "2004", |
|||
"Director": "宫崎骏" |
|||
}, |
|||
{ |
|||
"Title": "十二怒汉", |
|||
"Rating": "9.4", |
|||
"Year": "1957", |
|||
"Director": "西德尼·吕美特" |
|||
}, |
|||
{ |
|||
"Title": "海蒂和爷爷", |
|||
"Rating": "9.3", |
|||
"Year": "2015", |
|||
"Director": "阿兰·葛斯彭纳" |
|||
}, |
|||
{ |
|||
"Title": "素媛", |
|||
"Rating": "9.3", |
|||
"Year": "2013", |
|||
"Director": "李濬益" |
|||
}, |
|||
{ |
|||
"Title": "猫鼠游戏", |
|||
"Rating": "9.1", |
|||
"Year": "2002", |
|||
"Director": "史蒂文·斯皮尔伯格" |
|||
}, |
|||
{ |
|||
"Title": "天空之城", |
|||
"Rating": "9.2", |
|||
"Year": "1986", |
|||
"Director": "宫崎骏" |
|||
}, |
|||
{ |
|||
"Title": "鬼子来了", |
|||
"Rating": "9.3", |
|||
"Year": "2000", |
|||
"Director": "姜文" |
|||
}, |
|||
{ |
|||
"Title": "摔跤吧!爸爸", |
|||
"Rating": "9.0", |
|||
"Year": "2016", |
|||
"Director": "涅提·蒂瓦里" |
|||
}, |
|||
{ |
|||
"Title": "少年派的奇幻漂流", |
|||
"Rating": "9.1", |
|||
"Year": "2012", |
|||
"Director": "李安" |
|||
}, |
|||
{ |
|||
"Title": "钢琴家", |
|||
"Rating": "9.3", |
|||
"Year": "2002", |
|||
"Director": "罗曼·波兰斯基" |
|||
}, |
|||
{ |
|||
"Title": "死亡诗社", |
|||
"Rating": "9.2", |
|||
"Year": "1989", |
|||
"Director": "彼得·威尔" |
|||
}, |
|||
{ |
|||
"Title": "指环王2:双塔奇兵", |
|||
"Rating": "9.2", |
|||
"Year": "2002", |
|||
"Director": "彼得·杰克逊" |
|||
}, |
|||
{ |
|||
"Title": "大话西游之月光宝盒", |
|||
"Rating": "9.0", |
|||
"Year": "1995", |
|||
"Director": "刘镇伟" |
|||
}, |
|||
{ |
|||
"Title": "绿皮书", |
|||
"Rating": "8.9", |
|||
"Year": "2018", |
|||
"Director": "彼得·法雷里" |
|||
}, |
|||
{ |
|||
"Title": "何以为家", |
|||
"Rating": "9.1", |
|||
"Year": "2018", |
|||
"Director": "娜丁·拉巴基" |
|||
}, |
|||
{ |
|||
"Title": "闻香识女人", |
|||
"Rating": "9.1", |
|||
"Year": "1992", |
|||
"Director": "马丁·布莱斯" |
|||
}, |
|||
{ |
|||
"Title": "大闹天宫", |
|||
"Rating": "9.4", |
|||
"Year": "0", |
|||
"Director": "万籁鸣" |
|||
}, |
|||
{ |
|||
"Title": "黑客帝国", |
|||
"Rating": "9.1", |
|||
"Year": "1999", |
|||
"Director": "安迪·沃卓斯基" |
|||
}, |
|||
{ |
|||
"Title": "指环王1:护戒使者", |
|||
"Rating": "9.1", |
|||
"Year": "2001", |
|||
"Director": "彼得·杰克逊" |
|||
}, |
|||
{ |
|||
"Title": "罗马假日", |
|||
"Rating": "9.1", |
|||
"Year": "1953", |
|||
"Director": "威廉·惠勒" |
|||
}, |
|||
{ |
|||
"Title": "教父2", |
|||
"Rating": "9.3", |
|||
"Year": "1974", |
|||
"Director": "弗朗西斯·福特·科波拉" |
|||
}, |
|||
{ |
|||
"Title": "狮子王", |
|||
"Rating": "9.1", |
|||
"Year": "1994", |
|||
"Director": "Roger" |
|||
}, |
|||
{ |
|||
"Title": "天堂电影院", |
|||
"Rating": "9.2", |
|||
"Year": "1988", |
|||
"Director": "朱塞佩·托纳多雷" |
|||
}, |
|||
{ |
|||
"Title": "饮食男女", |
|||
"Rating": "9.2", |
|||
"Year": "1994", |
|||
"Director": "李安" |
|||
}, |
|||
{ |
|||
"Title": "辩护人", |
|||
"Rating": "9.2", |
|||
"Year": "2013", |
|||
"Director": "杨宇硕" |
|||
}, |
|||
{ |
|||
"Title": "本杰明·巴顿奇事", |
|||
"Rating": "9.0", |
|||
"Year": "2008", |
|||
"Director": "大卫·芬奇" |
|||
}, |
|||
{ |
|||
"Title": "搏击俱乐部", |
|||
"Rating": "9.0", |
|||
"Year": "1999", |
|||
"Director": "大卫·芬奇" |
|||
}, |
|||
{ |
|||
"Title": "美丽心灵", |
|||
"Rating": "9.1", |
|||
"Year": "2001", |
|||
"Director": "朗·霍华德" |
|||
}, |
|||
{ |
|||
"Title": "穿条纹睡衣的男孩", |
|||
"Rating": "9.2", |
|||
"Year": "2008", |
|||
"Director": "马克·赫尔曼" |
|||
}, |
|||
{ |
|||
"Title": "哈利·波特与死亡圣器(下)", |
|||
"Rating": "9.0", |
|||
"Year": "2011", |
|||
"Director": "大卫·叶茨" |
|||
}, |
|||
{ |
|||
"Title": "情书", |
|||
"Rating": "8.9", |
|||
"Year": "1995", |
|||
"Director": "岩井俊二" |
|||
}, |
|||
{ |
|||
"Title": "两杆大烟枪", |
|||
"Rating": "9.1", |
|||
"Year": "1998", |
|||
"Director": "盖·里奇" |
|||
}, |
|||
{ |
|||
"Title": "窃听风暴", |
|||
"Rating": "9.2", |
|||
"Year": "2006", |
|||
"Director": "弗洛里安·亨克尔·冯·多纳斯马尔克" |
|||
}, |
|||
{ |
|||
"Title": "功夫", |
|||
"Rating": "8.9", |
|||
"Year": "2004", |
|||
"Director": "周星驰" |
|||
}, |
|||
{ |
|||
"Title": "音乐之声", |
|||
"Rating": "9.1", |
|||
"Year": "1965", |
|||
"Director": "罗伯特·怀斯" |
|||
}, |
|||
{ |
|||
"Title": "哈利·波特与阿兹卡班的囚徒", |
|||
"Rating": "9.0", |
|||
"Year": "2004", |
|||
"Director": "阿方索·卡隆" |
|||
}, |
|||
{ |
|||
"Title": "阿凡达", |
|||
"Rating": "8.8", |
|||
"Year": "2009", |
|||
"Director": "詹姆斯·卡梅隆" |
|||
}, |
|||
{ |
|||
"Title": "西西里的美丽传说", |
|||
"Rating": "8.9", |
|||
"Year": "2000", |
|||
"Director": "朱塞佩·托纳多雷" |
|||
}, |
|||
{ |
|||
"Title": "看不见的客人", |
|||
"Rating": "8.8", |
|||
"Year": "2016", |
|||
"Director": "奥里奥尔·保罗" |
|||
} |
|||
] |
|||
|
@ -0,0 +1,122 @@ |
|||
[ |
|||
{ |
|||
"Title": "静夜思", |
|||
"Author": "李白", |
|||
"Dynasty": "唐代", |
|||
"Content": "床前明月光\n疑是地上霜\n举头望明月\n低头思故乡" |
|||
}, |
|||
{ |
|||
"Title": "春晓", |
|||
"Author": "孟浩然", |
|||
"Dynasty": "唐代", |
|||
"Content": "春眠不觉晓\n处处闻啼鸟\n夜来风雨声\n花落知多少" |
|||
}, |
|||
{ |
|||
"Title": "登鹳雀楼", |
|||
"Author": "王之涣", |
|||
"Dynasty": "唐代", |
|||
"Content": "白日依山尽\n黄河入海流\n欲穷千里目\n更上一层楼" |
|||
}, |
|||
{ |
|||
"Title": "相思", |
|||
"Author": "王维", |
|||
"Dynasty": "唐代", |
|||
"Content": "红豆生南国\n春来发几枝\n愿君多采撷\n此物最相思" |
|||
}, |
|||
{ |
|||
"Title": "悯农", |
|||
"Author": "李绅", |
|||
"Dynasty": "唐代", |
|||
"Content": "锄禾日当午\n汗滴禾下土\n谁知盘中餐\n粒粒皆辛苦" |
|||
}, |
|||
{ |
|||
"Title": "咏鹅", |
|||
"Author": "骆宾王", |
|||
"Dynasty": "唐代", |
|||
"Content": "鹅鹅鹅\n曲项向天歌\n白毛浮绿水\n红掌拨清波" |
|||
}, |
|||
{ |
|||
"Title": "江雪", |
|||
"Author": "柳宗元", |
|||
"Dynasty": "唐代", |
|||
"Content": "千山鸟飞绝\n万径人踪灭\n孤舟蓑笠翁\n独钓寒江雪" |
|||
}, |
|||
{ |
|||
"Title": "望庐山瀑布", |
|||
"Author": "李白", |
|||
"Dynasty": "唐代", |
|||
"Content": "日照香炉生紫烟\n遥看瀑布挂前川\n飞流直下三千尺\n疑是银河落九天" |
|||
}, |
|||
{ |
|||
"Title": "出塞", |
|||
"Author": "王昌龄", |
|||
"Dynasty": "唐代", |
|||
"Content": "秦时明月汉时关\n万里长征人未还\n但使龙城飞将在\n不教胡马度阴山" |
|||
}, |
|||
{ |
|||
"Title": "绝句", |
|||
"Author": "杜甫", |
|||
"Dynasty": "唐代", |
|||
"Content": "两个黄鹂鸣翠柳\n一行白鹭上青天\n窗含西岭千秋雪\n门泊东吴万里船" |
|||
}, |
|||
{ |
|||
"Title": "静夜思", |
|||
"Author": "李白", |
|||
"Dynasty": "唐代", |
|||
"Content": "床前明月光\n疑是地上霜\n举头望明月\n低头思故乡" |
|||
}, |
|||
{ |
|||
"Title": "春晓", |
|||
"Author": "孟浩然", |
|||
"Dynasty": "唐代", |
|||
"Content": "春眠不觉晓\n处处闻啼鸟\n夜来风雨声\n花落知多少" |
|||
}, |
|||
{ |
|||
"Title": "登鹳雀楼", |
|||
"Author": "王之涣", |
|||
"Dynasty": "唐代", |
|||
"Content": "白日依山尽\n黄河入海流\n欲穷千里目\n更上一层楼" |
|||
}, |
|||
{ |
|||
"Title": "相思", |
|||
"Author": "王维", |
|||
"Dynasty": "唐代", |
|||
"Content": "红豆生南国\n春来发几枝\n愿君多采撷\n此物最相思" |
|||
}, |
|||
{ |
|||
"Title": "悯农", |
|||
"Author": "李绅", |
|||
"Dynasty": "唐代", |
|||
"Content": "锄禾日当午\n汗滴禾下土\n谁知盘中餐\n粒粒皆辛苦" |
|||
}, |
|||
{ |
|||
"Title": "咏鹅", |
|||
"Author": "骆宾王", |
|||
"Dynasty": "唐代", |
|||
"Content": "鹅鹅鹅\n曲项向天歌\n白毛浮绿水\n红掌拨清波" |
|||
}, |
|||
{ |
|||
"Title": "江雪", |
|||
"Author": "柳宗元", |
|||
"Dynasty": "唐代", |
|||
"Content": "千山鸟飞绝\n万径人踪灭\n孤舟蓑笠翁\n独钓寒江雪" |
|||
}, |
|||
{ |
|||
"Title": "望庐山瀑布", |
|||
"Author": "李白", |
|||
"Dynasty": "唐代", |
|||
"Content": "日照香炉生紫烟\n遥看瀑布挂前川\n飞流直下三千尺\n疑是银河落九天" |
|||
}, |
|||
{ |
|||
"Title": "出塞", |
|||
"Author": "王昌龄", |
|||
"Dynasty": "唐代", |
|||
"Content": "秦时明月汉时关\n万里长征人未还\n但使龙城飞将在\n不教胡马度阴山" |
|||
}, |
|||
{ |
|||
"Title": "绝句", |
|||
"Author": "杜甫", |
|||
"Dynasty": "唐代", |
|||
"Content": "两个黄鹂鸣翠柳\n一行白鹭上青天\n窗含西岭千秋雪\n门泊东吴万里船" |
|||
} |
|||
] |
|||
@ -0,0 +1,38 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
|
|||
<groupId>com.example</groupId> |
|||
<artifactId>datacollect</artifactId> |
|||
<version>1.0-SNAPSHOT</version> |
|||
|
|||
<properties> |
|||
<maven.compiler.source>8</maven.compiler.source> |
|||
<maven.compiler.target>8</maven.compiler.target> |
|||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
<configuration> |
|||
<source>8</source> |
|||
<target>8</target> |
|||
</configuration> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -0,0 +1,11 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<module type="JAVA_MODULE" version="4"> |
|||
<component name="NewModuleRootManager" inherit-compiler-output="true"> |
|||
<exclude-output /> |
|||
<content url="file://$MODULE_DIR$"> |
|||
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" /> |
|||
</content> |
|||
<orderEntry type="inheritedJdk" /> |
|||
<orderEntry type="sourceFolder" forTests="false" /> |
|||
</component> |
|||
</module> |
|||
@ -0,0 +1,119 @@ |
|||
package project; |
|||
|
|||
import project.analysis.JobAnalyzer; |
|||
import project.analysis.MovieAnalyzer; |
|||
import project.analysis.PoemAnalyzer; |
|||
import project.bean.Job; |
|||
import project.bean.Movie; |
|||
import project.bean.Poem; |
|||
import project.crawler.JobCrawler; |
|||
import project.crawler.MovieCrawler; |
|||
import project.crawler.PoemCrawler; |
|||
import project.exception.CrawlerException; |
|||
import project.utils.DataStorage; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class AutoTest { |
|||
public static void main(String[] args) { |
|||
System.out.println("=== 多源数据爬取与分析系统 - 自动测试 ==="); |
|||
System.out.println("当前时间: 2026-05-23 14:47:45"); |
|||
System.out.println("当前地点: 湖南省长沙市"); |
|||
System.out.println(); |
|||
|
|||
// 1. 测试豆瓣电影爬虫
|
|||
System.out.println("【1/3】正在爬取豆瓣电影 Top 250..."); |
|||
try { |
|||
MovieCrawler movieCrawler = new MovieCrawler(); |
|||
List<Movie> movies = movieCrawler.crawl(3); |
|||
System.out.println("成功爬取 " + movies.size() + " 部电影"); |
|||
|
|||
if (!movies.isEmpty()) { |
|||
try { |
|||
DataStorage.saveToCsv(movies, "output/movies.csv"); |
|||
DataStorage.saveToJson(movies, "output/movies.json"); |
|||
System.out.println("数据已保存到文件: output/movies.csv"); |
|||
System.out.println("数据已保存到JSON文件: output/movies.json"); |
|||
} catch (IOException e) { |
|||
System.out.println("保存电影数据失败: " + e.getMessage()); |
|||
} |
|||
|
|||
System.out.println("\n【电影数据分析】"); |
|||
System.out.println("总数: " + movies.size()); |
|||
System.out.printf("平均评分: %.2f%n", MovieAnalyzer.calculateAverageRating(movies)); |
|||
System.out.println("\n评分分布:"); |
|||
Map<String, Long> ratingDist = MovieAnalyzer.analyzeRatingDistribution(movies); |
|||
ratingDist.forEach((key, value) -> System.out.printf(" %-10s %d 部%n", key, value)); |
|||
} else { |
|||
System.out.println("电影数据为空,跳过保存和分析"); |
|||
} |
|||
} catch (CrawlerException e) { |
|||
System.out.println("爬取电影失败: " + e.getMessage()); |
|||
} |
|||
|
|||
// 2. 测试前程无忧爬虫
|
|||
System.out.println("\n【2/3】正在爬取前程无忧招聘数据..."); |
|||
try { |
|||
JobCrawler jobCrawler = new JobCrawler(); |
|||
List<Job> jobs = jobCrawler.crawl(2); |
|||
System.out.println("成功爬取 " + jobs.size() + " 条招聘信息"); |
|||
|
|||
if (!jobs.isEmpty()) { |
|||
try { |
|||
DataStorage.saveToCsv(jobs, "output/jobs.csv"); |
|||
DataStorage.saveToJson(jobs, "output/jobs.json"); |
|||
System.out.println("数据已保存到文件: output/jobs.csv"); |
|||
System.out.println("数据已保存到JSON文件: output/jobs.json"); |
|||
} catch (IOException e) { |
|||
System.out.println("保存招聘数据失败: " + e.getMessage()); |
|||
} |
|||
|
|||
System.out.println("\n【招聘数据分析】"); |
|||
System.out.println("总数: " + jobs.size()); |
|||
System.out.println("城市分布(Top5):"); |
|||
Map<String, Long> locationDist = JobAnalyzer.analyzeLocationDistribution(jobs); |
|||
locationDist.forEach((key, value) -> System.out.printf(" %-10s %d 个职位%n", key, value)); |
|||
} else { |
|||
System.out.println("招聘数据为空,跳过保存和分析"); |
|||
} |
|||
} catch (CrawlerException e) { |
|||
System.out.println("爬取招聘信息失败: " + e.getMessage()); |
|||
} |
|||
|
|||
// 3. 测试古诗词爬虫
|
|||
System.out.println("\n【3/3】正在爬取古诗词数据..."); |
|||
try { |
|||
PoemCrawler poemCrawler = new PoemCrawler(); |
|||
List<Poem> poems = poemCrawler.crawl(2); |
|||
System.out.println("成功爬取 " + poems.size() + " 首诗词"); |
|||
|
|||
if (!poems.isEmpty()) { |
|||
try { |
|||
DataStorage.saveToCsv(poems, "output/poems.csv"); |
|||
DataStorage.saveToJson(poems, "output/poems.json"); |
|||
System.out.println("数据已保存到文件: output/poems.csv"); |
|||
System.out.println("数据已保存到JSON文件: output/poems.json"); |
|||
} catch (IOException e) { |
|||
System.out.println("保存诗词数据失败: " + e.getMessage()); |
|||
} |
|||
|
|||
System.out.println("\n【诗词数据分析】"); |
|||
System.out.println("总数: " + poems.size()); |
|||
System.out.printf("平均长度: %.2f 字%n", PoemAnalyzer.calculateAverageLength(poems)); |
|||
System.out.println("\n朝代分布:"); |
|||
Map<String, Long> dynastyDist = PoemAnalyzer.analyzeDynastyDistribution(poems); |
|||
dynastyDist.forEach((key, value) -> System.out.printf(" %-5s %d 首%n", key, value)); |
|||
} else { |
|||
System.out.println("诗词数据为空,跳过保存和分析"); |
|||
} |
|||
} catch (CrawlerException e) { |
|||
System.out.println("爬取诗词失败: " + e.getMessage()); |
|||
} |
|||
|
|||
System.out.println("\n=== 数据爬取与分析完成 ==="); |
|||
System.out.println("数据已保存到 output/ 目录"); |
|||
} |
|||
} |
|||
@ -0,0 +1,28 @@ |
|||
package project; |
|||
|
|||
import project.view.ConsoleView; |
|||
import project.controller.CrawlerController; |
|||
|
|||
import java.io.File; |
|||
|
|||
public class Main { |
|||
public static void main(String[] args) { |
|||
ConsoleView view = new ConsoleView(); |
|||
CrawlerController controller = new CrawlerController(view); |
|||
|
|||
new File("output").mkdirs(); |
|||
|
|||
view.printWelcome(); |
|||
view.printInfo("输入 help 查看可用命令"); |
|||
|
|||
while (true) { |
|||
String input = view.readCommand(); |
|||
|
|||
if (controller.isExitCommand(input)) { |
|||
break; |
|||
} |
|||
|
|||
controller.execute(input); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,76 @@ |
|||
package project.analysis; |
|||
|
|||
import project.bean.Job; |
|||
|
|||
import java.util.*; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class JobAnalyzer { |
|||
public static Map<String, Long> analyzeLocationDistribution(List<Job> jobs) { |
|||
return jobs.stream() |
|||
.filter(j -> j.getLocation() != null && !j.getLocation().isEmpty()) |
|||
.collect(Collectors.groupingBy(Job::getLocation, Collectors.counting())) |
|||
.entrySet().stream() |
|||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|||
.limit(10) |
|||
.collect(Collectors.toMap( |
|||
Map.Entry::getKey, |
|||
Map.Entry::getValue, |
|||
(e1, e2) -> e1, |
|||
LinkedHashMap::new |
|||
)); |
|||
} |
|||
|
|||
public static Map<String, Long> analyzeExperienceDistribution(List<Job> jobs) { |
|||
return jobs.stream() |
|||
.filter(j -> j.getExperience() != null && !j.getExperience().isEmpty()) |
|||
.collect(Collectors.groupingBy(Job::getExperience, Collectors.counting())); |
|||
} |
|||
|
|||
public static Map<String, Long> analyzeEducationDistribution(List<Job> jobs) { |
|||
return jobs.stream() |
|||
.filter(j -> j.getEducation() != null && !j.getEducation().isEmpty()) |
|||
.collect(Collectors.groupingBy(Job::getEducation, Collectors.counting())); |
|||
} |
|||
|
|||
public static Map<String, Long> analyzeSalaryDistribution(List<Job> jobs) { |
|||
return jobs.stream() |
|||
.filter(j -> j.getSalary() != null && !j.getSalary().isEmpty()) |
|||
.collect(Collectors.groupingBy(Job::getSalary, Collectors.counting())) |
|||
.entrySet().stream() |
|||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|||
.limit(10) |
|||
.collect(Collectors.toMap( |
|||
Map.Entry::getKey, |
|||
Map.Entry::getValue, |
|||
(e1, e2) -> e1, |
|||
LinkedHashMap::new |
|||
)); |
|||
} |
|||
|
|||
public static Map<String, Double> analyzeSalaryByExperience(List<Job> jobs) { |
|||
return jobs.stream() |
|||
.filter(j -> j.getExperience() != null && !j.getExperience().isEmpty() && |
|||
j.getSalary() != null && !j.getSalary().isEmpty()) |
|||
.collect(Collectors.groupingBy( |
|||
Job::getExperience, |
|||
Collectors.averagingDouble(j -> extractAvgSalary(j.getSalary())) |
|||
)); |
|||
} |
|||
|
|||
private static double extractAvgSalary(String salary) { |
|||
// 解析薪资如 "10-15K" -> 12.5
|
|||
try { |
|||
String cleanSalary = salary.replace("K", "").replace("k", ""); |
|||
String[] parts = cleanSalary.split("-"); |
|||
if (parts.length == 2) { |
|||
double min = Double.parseDouble(parts[0].trim()); |
|||
double max = Double.parseDouble(parts[1].trim()); |
|||
return (min + max) / 2; |
|||
} |
|||
} catch (Exception e) { |
|||
// ignore
|
|||
} |
|||
return 0.0; |
|||
} |
|||
} |
|||
@ -0,0 +1,73 @@ |
|||
package project.analysis; |
|||
|
|||
import project.bean.Poem; |
|||
|
|||
import java.util.*; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class PoemAnalyzer { |
|||
public static Map<String, Long> analyzeDynastyDistribution(List<Poem> poems) { |
|||
return poems.stream() |
|||
.filter(p -> p.getDynasty() != null && !p.getDynasty().equals("Unknown")) |
|||
.collect(Collectors.groupingBy(Poem::getDynasty, Collectors.counting())); |
|||
} |
|||
|
|||
public static Map<String, Long> analyzeAuthorTop10(List<Poem> poems) { |
|||
return poems.stream() |
|||
.filter(p -> p.getAuthor() != null && !p.getAuthor().equals("Unknown")) |
|||
.collect(Collectors.groupingBy(Poem::getAuthor, Collectors.counting())) |
|||
.entrySet().stream() |
|||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|||
.limit(10) |
|||
.collect(Collectors.toMap( |
|||
Map.Entry::getKey, |
|||
Map.Entry::getValue, |
|||
(e1, e2) -> e1, |
|||
LinkedHashMap::new |
|||
)); |
|||
} |
|||
|
|||
public static Map<String, Long> extractHighFrequencyWords(List<Poem> poems, int topN) { |
|||
Map<String, Long> wordCount = new HashMap<>(); |
|||
|
|||
// 常见停用词
|
|||
Set<String> stopWords = new HashSet<>(Arrays.asList( |
|||
"的", "了", "和", "是", "就", "都", "而", "及", "与", "着", "或", |
|||
"一个", "没有", "我们", "你们", "他们", "它", "这", "那", "此", |
|||
"在", "有", "不", "能", "会", "可以", "要", "应该", "可能", |
|||
"上", "下", "前", "后", "左", "右", "中", "间", "里", "外", |
|||
"来", "去", "过", "到", "出", "入", "进", "回", "起", "走" |
|||
)); |
|||
|
|||
for (Poem poem : poems) { |
|||
if (poem.getContent() != null && !poem.getContent().isEmpty()) { |
|||
String content = poem.getContent(); |
|||
// 简单分词:按字分割(中文)
|
|||
for (int i = 0; i < content.length(); i++) { |
|||
String word = String.valueOf(content.charAt(i)); |
|||
if (!stopWords.contains(word) && word.matches("[\\u4e00-\\u9fa5]")) { |
|||
wordCount.merge(word, 1L, Long::sum); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
return wordCount.entrySet().stream() |
|||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|||
.limit(topN) |
|||
.collect(Collectors.toMap( |
|||
Map.Entry::getKey, |
|||
Map.Entry::getValue, |
|||
(e1, e2) -> e1, |
|||
LinkedHashMap::new |
|||
)); |
|||
} |
|||
|
|||
public static double calculateAverageLength(List<Poem> poems) { |
|||
return poems.stream() |
|||
.filter(p -> p.getContent() != null) |
|||
.mapToInt(p -> p.getContent().length()) |
|||
.average() |
|||
.orElse(0.0); |
|||
} |
|||
} |
|||