Browse Source

我已提交了期末实验报告

main
zhangsiyuan 3 weeks ago
parent
commit
ecae7e9799
  1. 10
      project/.idea/.gitignore
  2. 1
      project/.idea/.name
  3. 6
      project/.idea/misc.xml
  4. 8
      project/.idea/modules.xml
  5. 6
      project/.idea/vcs.xml
  6. 638
      project/202401070104-张思渊-期末实验报告.md
  7. BIN
      project/202401070104-张思渊-期末实验报告docx.docx
  8. BIN
      project/202401070104-张思渊-期末实验报告docx.pdf
  9. BIN
      project/bin/Main.class
  10. BIN
      project/bin/com/example/datacollect/CrawlTest.class
  11. BIN
      project/bin/com/example/datacollect/Main.class
  12. BIN
      project/bin/com/example/datacollect/TestHtml.class
  13. BIN
      project/bin/com/example/datacollect/command/AnalyzeCommand.class
  14. BIN
      project/bin/com/example/datacollect/command/Command.class
  15. BIN
      project/bin/com/example/datacollect/command/CrawlCommand.class
  16. BIN
      project/bin/com/example/datacollect/command/ExitCommand.class
  17. BIN
      project/bin/com/example/datacollect/command/ExportCommand.class
  18. BIN
      project/bin/com/example/datacollect/command/HelpCommand.class
  19. BIN
      project/bin/com/example/datacollect/command/HistoryCommand.class
  20. BIN
      project/bin/com/example/datacollect/command/ListCommand.class
  21. BIN
      project/bin/com/example/datacollect/controller/CrawlerController.class
  22. BIN
      project/bin/com/example/datacollect/exception/CrawlerException.class
  23. BIN
      project/bin/com/example/datacollect/exception/NetworkException.class
  24. BIN
      project/bin/com/example/datacollect/exception/ParseException.class
  25. BIN
      project/bin/com/example/datacollect/model/Article.class
  26. BIN
      project/bin/com/example/datacollect/repository/ArticleRepository.class
  27. BIN
      project/bin/com/example/datacollect/strategy/CrawlStrategy.class
  28. BIN
      project/bin/com/example/datacollect/strategy/DoubanBookStrategy.class
  29. BIN
      project/bin/com/example/datacollect/strategy/DoubanMovieStrategy.class
  30. BIN
      project/bin/com/example/datacollect/strategy/PoetryStrategy.class
  31. BIN
      project/bin/com/example/datacollect/strategy/StrategyFactory.class
  32. BIN
      project/bin/com/example/datacollect/utils/DataCleaner.class
  33. BIN
      project/bin/com/example/datacollect/utils/HttpUtils.class
  34. BIN
      project/bin/com/example/datacollect/view/ConsoleView.class
  35. BIN
      project/bin/project/AutoTest.class
  36. BIN
      project/bin/project/Main.class
  37. BIN
      project/bin/project/analysis/BookAnalyzer.class
  38. BIN
      project/bin/project/analysis/JobAnalyzer.class
  39. BIN
      project/bin/project/analysis/MovieAnalyzer.class
  40. BIN
      project/bin/project/analysis/PoemAnalyzer.class
  41. BIN
      project/bin/project/bean/Book.class
  42. BIN
      project/bin/project/bean/Job.class
  43. BIN
      project/bin/project/bean/Movie.class
  44. BIN
      project/bin/project/bean/Poem.class
  45. BIN
      project/bin/project/bean/Quote.class
  46. BIN
      project/bin/project/command/AnalyzeCommand.class
  47. BIN
      project/bin/project/command/Command.class
  48. BIN
      project/bin/project/command/CrawlCommand.class
  49. BIN
      project/bin/project/command/ExitCommand.class
  50. BIN
      project/bin/project/command/HelpCommand.class
  51. BIN
      project/bin/project/command/HistoryCommand.class
  52. BIN
      project/bin/project/command/ListCommand.class
  53. BIN
      project/bin/project/command/SaveCommand.class
  54. BIN
      project/bin/project/controller/CrawlerController.class
  55. BIN
      project/bin/project/core/AbstractWebCrawler.class
  56. BIN
      project/bin/project/core/DataEntity.class
  57. BIN
      project/bin/project/core/WebCrawler.class
  58. BIN
      project/bin/project/crawler/BookCrawler.class
  59. BIN
      project/bin/project/crawler/JobCrawler.class
  60. BIN
      project/bin/project/crawler/MovieCrawler.class
  61. BIN
      project/bin/project/crawler/PoemCrawler.class
  62. BIN
      project/bin/project/display/ResultDisplay.class
  63. BIN
      project/bin/project/exception/CrawlerException.class
  64. BIN
      project/bin/project/exception/ParseException.class
  65. BIN
      project/bin/project/strategy/CrawlStrategy.class
  66. BIN
      project/bin/project/strategy/CrawlerContext.class
  67. BIN
      project/bin/project/strategy/JobCrawlStrategy.class
  68. BIN
      project/bin/project/strategy/MovieCrawlStrategy.class
  69. BIN
      project/bin/project/strategy/PoemCrawlStrategy.class
  70. BIN
      project/bin/project/utils/DataCleaner.class
  71. BIN
      project/bin/project/utils/DataStorage.class
  72. BIN
      project/bin/project/utils/HttpUtils.class
  73. BIN
      project/bin/project/view/ConsoleView.class
  74. BIN
      project/bin/project/visualization/ChartGenerator.class
  75. BIN
      project/images/1.png
  76. BIN
      project/images/2.png
  77. BIN
      project/images/3.png
  78. BIN
      project/images/4.png
  79. BIN
      project/images/5.png
  80. BIN
      project/images/6.png
  81. BIN
      project/images/7.png
  82. BIN
      project/images/8.png
  83. BIN
      project/output/charts/movie_rating_distribution.png
  84. BIN
      project/output/charts/movie_top_directors.png
  85. BIN
      project/output/charts/rating_distribution.png
  86. BIN
      project/output/charts/rating_range_pie.png
  87. BIN
      project/output/charts/top_directors.png
  88. BIN
      project/output/charts/year_rating_correlation.png
  89. 21
      project/output/jobs.csv
  90. 162
      project/output/jobs.json
  91. 76
      project/output/movies.csv
  92. 452
      project/output/movies.json
  93. 81
      project/output/poems.csv
  94. 122
      project/output/poems.json
  95. 38
      project/pom.xml
  96. 11
      project/project.iml
  97. 119
      project/src/project/AutoTest.java
  98. 28
      project/src/project/Main.java
  99. 76
      project/src/project/analysis/JobAnalyzer.java
  100. 73
      project/src/project/analysis/PoemAnalyzer.java

10
project/.idea/.gitignore

@ -0,0 +1,10 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# 依赖于环境的 Maven 主目录路径
/mavenHomeManager.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

1
project/.idea/.name

@ -0,0 +1 @@
ConsoleView.java

6
project/.idea/misc.xml

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
project/.idea/modules.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/project.iml" filepath="$PROJECT_DIR$/project.iml" />
</modules>
</component>
</project>

6
project/.idea/vcs.xml

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>

638
project/202401070104-张思渊-期末实验报告.md

@ -0,0 +1,638 @@
# 《高级程序设计》项目报告:爬虫项目开发全过程记录
## 一、项目目标
### 1.1 功能目标
| 功能 | 描述 | 优先级 |
|------|------|--------|
| 爬取豆瓣电影数据 | 爬取豆瓣电影Top250的电影标题、评分、年份、导演等信息 | 高 |
| 爬取前程无忧招聘数据 | 爬取Java相关职位的职位名称、公司、薪资、城市、经验要求等信息 | 高 |
| 爬取古诗词数据 | 爬取古诗词网站的诗词标题、作者、朝代、内容等信息 | 高 |
| 数据清洗 | 去除HTML标签、空格、特殊字符,格式化日期,处理缺失值 | 高 |
| 数据存储 | 将清洗后的数据保存为CSV和JSON格式文件 | 高 |
| 数据分析 | 使用Stream API进行统计分析,如评分分布、薪资分析、高频词提取 | 中 |
| CLI交互界面 | 实现命令行交互界面,支持用户输入命令操作 | 中 |
| 结果展示 | 控制台打印统计表格,生成分析报告 | 中 |
### 1.2 预期效果
(1)成功爬取3个不同网站的数据,每个网站至少爬取100条记录。
(2)数据清洗后保存为结构化文件,便于后续分析。
(3)通过CLI界面实现交互式操作,支持命令输入。
(4)提供数据统计分析功能,输出可视化报告。
(5)实现真正的MVC三层架构分离。
---
## 二、项目进展
### W1:类与对象基础,构造方法与封装
**本周任务:**
- 实现Movie实体类,包含title、rating、year、director字段
- 实现Job实体类,包含title、company、location、salary、experience、education字段
- 实现Poem实体类,包含title、author、dynasty、content字段
**所学知识:**
- Java封装性原理
- private关键字的使用
- Getter和Setter方法的设计
- 构造方法重载
**遇到的困难:**
- 觉得Java写Getter/Setter很繁琐,不理解为什么不能像Python一样直接访问属性
**如何解决的:**
- 通过查找资料和询问AI,理解了封装是为了数据安全和后期维护,确保数据完整性
**AI是如何帮助的:**
- 将Python类代码喂给AI,AI生成了对应的Java代码
- AI解释了访问修饰符的作用和封装的意义
- AI建议了接口设计方案,实现数据处理的统一
---
### W2:继承与方法重写
**本周任务:**
- 实现AbstractWebCrawler抽象类,包含crawl()和parse()方法
- 实现MovieCrawler子类,重写父类方法
- 实现JobCrawler子类,重写父类方法
- 实现PoemCrawler子类,重写父类方法
**所学知识:**
- extends关键字实现继承
- @Override注解标记方法重写
- super关键字调用父类构造方法
- 抽象类与抽象方法的定义
**遇到的困难:**
- 子类构造方法中调用父类构造方法时参数传递错误
- 抽象方法的实现逻辑不清晰
**如何解决的:**
- 查阅Java文档,理解super()必须放在构造方法第一行
- 分析不同网站的HTML结构,设计针对性的解析逻辑
- 使用正则表达式提取页面数据
**AI是如何帮助的:**
- AI检查了继承关系的合理性
- AI生成了类图的Mermaid代码,帮助理解类结构
- AI提供了正则表达式的编写建议
---
### W3:多态实现
**本周任务:**
- 通过父类引用调用不同爬虫的爬取方法
- 使用List<AbstractWebCrawler>统一管理所有爬虫
- 实现爬虫的动态切换
**所学知识:**
- 向上转型的概念
- 动态绑定机制
- instanceof关键字的使用
- 多态的实际应用场景
**遇到的困难:**
- 不理解为什么父类引用可以调用子类重写的方法
- 不知道如何设计统一的爬虫调度机制
**如何解决的:**
- 通过调试代码,观察运行时的方法调用过程
- 理解了多态的本质是运行时根据对象的实际类型进行动态绑定
- 设计CrawlerManager统一管理爬虫实例
**AI是如何帮助的:**
- AI用生活化的比喻"遥控器控制不同电器"解释了多态的概念
- AI演示了多态在实际项目中的应用场景
- AI帮助设计了爬虫管理类的结构
---
### W4:抽象类与接口
**本周任务:**
- 设计ICrawler接口
- 设计IAnalyzer接口
- 让AbstractWebCrawler实现ICrawler接口
- 定义DataEntity接口统一数据访问
**所学知识:**
- interface关键字定义接口
- implements关键字实现接口
- 接口与抽象类的区别
- 接口的多实现特性
**遇到的困难:**
- 不确定什么时候用抽象类,什么时候用接口
- 接口方法的设计不够合理
**如何解决的:**
- 遵循"is-a用抽象类,has-a/can-do用接口"的原则
- 将爬虫的通用逻辑放在抽象类中,具体行为定义在接口中
- 通过小组讨论确定接口设计方案
**AI是如何帮助的:**
- AI演示了如何用接口解耦臃肿的代码
- AI对比了抽象类和接口的使用场景
- AI建议了合理的接口设计方案
---
### W5:加入异常处理
**本周任务:**
- 自定义CrawlerException异常类
- 自定义ParseException异常类
- 在Controller层统一捕获异常
- 给出友好的错误提示
**所学知识:**
- try-catch-finally异常处理结构
- throws关键字声明异常
- 自定义异常类的实现
- 异常继承体系的设计
**遇到的困难:**
- 网络请求超时导致程序崩溃,没有友好的错误提示
- 异常处理逻辑过于分散
**如何解决的:**
- 封装了CrawlerException,统一处理爬虫相关异常
- 在Controller层使用try-catch统一捕获异常
- 设计异常处理中间件,提供友好的错误提示
**AI是如何帮助的:**
- AI生成了异常体系的骨架代码
- AI建议了合理的异常继承结构
- AI帮助设计了异常处理的最佳实践
---
### W6:泛型与集合框架
**本周任务:**
- 使用List<Movie>、List<Job>、List<Poem>管理数据
- 使用Stream API进行数据统计和分析
- 使用Map进行数据分组和计数
**所学知识:**
- 泛型类和泛型方法
- List、Map接口的使用
- Stream API的链式调用
- Lambda表达式的应用
**遇到的困难:**
- Stream API的链式调用容易写错
- 泛型类型擦除导致编译错误
- 复杂的数据统计逻辑难以实现
**如何解决的:**
- 通过IDE的类型提示逐步修正代码
- 学习Stream API的常用操作方法
- 将复杂统计逻辑拆分为多个简单步骤
**AI是如何帮助的:**
- AI将一段传统的for循环代码改写为Stream API风格
- AI提供了Stream API的常用操作示例
- AI帮助调试泛型相关的编译错误
---
### W7:实现 CLI + MVC + Command模式 + 策略模式
**本周任务:**
- 划分Model/View/Controller职责
- 实现Command接口和具体命令类
- 实现策略模式处理不同爬取策略
- 实现CLI交互界面
**所学知识:**
- MVC架构模式
- Command设计模式
- Strategy设计模式
- CLI交互设计原则
**遇到的困难:**
- Controller中不小心混入了打印逻辑,违反了MVC原则
- 命令模式的实现不够灵活
**如何解决的:**
- 将打印逻辑移到View层
- 使用Map存储命令实例,实现命令的动态注册
- 设计命令别名机制,提高用户体验
**AI是如何帮助的:**
- AI检查了代码的MVC划分,指出问题所在
- AI提供了Command模式的实现模板
- AI建议了策略模式的设计方案
---
### W8:文件 I/O 与序列化
**本周任务:**
- 将数据写入CSV文件
- 将数据写入JSON文件
- 支持从文件读取数据
- 处理文件编码问题
**所学知识:**
- FileWriter和BufferedWriter的使用
- JSON数据格式的序列化
- CSV文件格式规范
- UTF-8编码处理
**遇到的困难:**
- CSV文件中包含逗号导致列错位
- JSON序列化时日期格式错误
- 文件路径处理复杂
**如何解决的:**
- 使用双引号包裹含逗号的字段
- 使用SimpleDateFormat格式化日期
- 封装DataStorage工具类统一处理文件操作
**AI是如何帮助的:**
- AI生成了CSV和JSON的读写工具类
- AI处理了边界情况,如特殊字符转义
- AI建议了文件路径的最佳实践
---
## 三、项目结构
### 3.1 最终包结构
```
project/
├── src/project/
│ ├── bean/ # Model 数据模型层
│ │ ├── Movie.java # 电影数据实体
│ │ ├── Job.java # 招聘数据实体
│ │ └── Poem.java # 诗词数据实体
│ │
│ ├── view/ # View 视图层
│ │ └── ConsoleView.java # 控制台UI交互
│ │
│ ├── controller/ # Controller 控制器层
│ │ └── CrawlerController.java # 命令调度中心
│ │
│ ├── command/ # Command 命令模式
│ │ ├── Command.java # 命令接口
│ │ ├── CrawlCommand.java # 爬取命令
│ │ ├── ListCommand.java # 列表命令
│ │ ├── AnalyzeCommand.java # 分析命令
│ │ ├── SaveCommand.java # 保存命令
│ │ ├── HelpCommand.java # 帮助命令
│ │ ├── HistoryCommand.java # 历史记录命令
│ │ └── ExitCommand.java # 退出命令
│ │
│ ├── core/ # 核心接口
│ │ ├── DataEntity.java # 数据实体接口
│ │ ├── WebCrawler.java # 爬虫接口
│ │ └── AbstractWebCrawler.java # 爬虫抽象类
│ │
│ ├── strategy/ # Strategy 策略模式
│ │ ├── CrawlStrategy.java # 爬取策略接口
│ │ ├── CrawlerContext.java # 策略上下文
│ │ ├── MovieCrawlStrategy.java # 电影爬取策略
│ │ ├── JobCrawlStrategy.java # 招聘爬取策略
│ │ └── PoemCrawlStrategy.java # 诗词爬取策略
│ │
│ ├── crawler/ # 爬虫实现
│ │ ├── MovieCrawler.java
│ │ ├── JobCrawler.java
│ │ └── PoemCrawler.java
│ │
│ ├── analysis/ # 数据分析
│ │ ├── MovieAnalyzer.java
│ │ ├── JobAnalyzer.java
│ │ └── PoemAnalyzer.java
│ │
│ ├── utils/ # 工具类
│ │ ├── HttpUtils.java
│ │ ├── DataCleaner.java
│ │ └── DataStorage.java
│ │
│ ├── exception/ # 异常类
│ │ ├── CrawlerException.java
│ │ └── ParseException.java
│ │
│ ├── Main.java # 主入口(CLI交互)
│ └── AutoTest.java # 自动测试
├── bin/ # 编译输出目录
└── output/ # 数据输出目录
```
### 3.2 MVC架构说明
| 层 | 包/类 | 职责 | 只做什么 |
|---|-------|------|----------|
| **Model** | `bean/*` | 数据模型 | 存储数据、提供getter/setter |
| **View** | `view/ConsoleView` | 用户界面 | 打印菜单、读取输入、展示结果 |
| **Controller** | `controller/*` | 业务调度 | 接收命令、调用Command执行 |
| **Command** | `command/*` | 命令执行 | 实现具体业务逻辑 |
### 3.3 设计模式
#### 3.3.1 Command模式
| 组件 | 职责 |
|------|------|
| `Command` 接口 | 定义命令的执行接口 |
| `CrawlCommand` | 爬取数据命令 |
| `ListCommand` | 显示列表命令 |
| `AnalyzeCommand` | 分析数据命令 |
| `SaveCommand` | 保存数据命令 |
#### 3.3.2 Strategy模式
| 组件 | 职责 |
|------|------|
| `CrawlStrategy` 接口 | 定义爬取策略接口 |
| `CrawlerContext` | 策略上下文,管理所有策略 |
| `MovieCrawlStrategy` | 电影爬取策略 |
| `JobCrawlStrategy` | 招聘爬取策略 |
| `PoemCrawlStrategy` | 诗词爬取策略 |
**策略模式类图:**
```mermaid
classDiagram
class CrawlStrategy~T extends DataEntity~ {
<<interface>>
+getType() String
+getTypeName() String
+crawl(int pages) List~T~
}
class CrawlerContext {
-Map~String, CrawlStrategy~ strategies
+registerStrategy(CrawlStrategy) void
+getStrategy(String) CrawlStrategy~T~
+hasStrategy(String) boolean
}
class MovieCrawlStrategy {
-MovieCrawler crawler
+getType() String
+getTypeName() String
+crawl(int pages) List~Movie~
}
class JobCrawlStrategy {
-JobCrawler crawler
+getType() String
+getTypeName() String
+crawl(int pages) List~Job~
}
class PoemCrawlStrategy {
-PoemCrawler crawler
+getType() String
+getTypeName() String
+crawl(int pages) List~Poem~
}
CrawlStrategy <|.. MovieCrawlStrategy
CrawlStrategy <|.. JobCrawlStrategy
CrawlStrategy <|.. PoemCrawlStrategy
CrawlerContext --> CrawlStrategy : uses
```
#### 3.3.3 异常体系说明
**类层次结构**
```
java.lang.Exception
└── CrawlerException (爬虫异常)
└── ParseException (解析异常)
```
**异常链路传播**
```
┌─────────────────────────────────────────────────────────────┐
│ 用户输入 │
│ "crawl movie" │
└───────────────────────────┬─────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ CrawlCommand │
│ .execute() │
│ throws CrawlerException │
└───────────────────────────┬─────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ MovieCrawlStrategy.crawl() │
│ throws CrawlerException │
└───────────────────────────┬─────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ MovieCrawler (extends AbstractWebCrawler) │
│ .crawl() │
│ throws CrawlerException │
└───────────────────────────┬─────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ AbstractWebCrawler │
│ .crawlSingleThread() │
│ throws CrawlerException │
└───────────────────────────┬─────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ HttpUtils │
│ .fetchHtml() │
│ throws CrawlerException │
│ │
│ 可能的异常: │
│ - HTTP 404/500/403 │
│ - 连接超时 │
│ - URL无效 │
│ - 网络不可达 │
└─────────────────────────────────────────────────────────────┘
```
### 3.4 完整类图
```mermaid
classDiagram
class ConsoleView {
<<View层>>
+readCommand() String
+printWelcome() void
+printHelp() void
+printMovieList(List) void
+printJobList(List) void
+printPoemList(List) void
+printSuccess(String) void
+printError(String) void
}
class CrawlerController {
<<Controller层>>
-Map~String, Command~ commands
-Map~String, String~ aliases
-List~String~ history
+execute(String) void
+getMovies() List~Movie~
+getJobs() List~Job~
+getPoems() List~Poem~
+isExitCommand(String) boolean
}
class Command {
<<interface>>
+execute(String[]) void
+getName() String
+getDescription() String
}
class CrawlCommand {
+execute(String[]) void
}
class ListCommand {
+execute(String[]) void
}
class AnalyzeCommand {
+execute(String[]) void
}
class SaveCommand {
+execute(String[]) void
}
class HelpCommand {
+execute(String[]) void
}
class HistoryCommand {
+execute(String[]) void
}
class ExitCommand {
+execute(String[]) void
}
class MovieCrawler {
+parsePage(String, int) List~Movie~
}
class JobCrawler {
+parsePage(String, int) List~Job~
}
class PoemCrawler {
+parsePage(String, int) List~Poem~
}
ConsoleView --> CrawlerController : uses
CrawlerController --> Command : uses
Command <|.. CrawlCommand
Command <|.. ListCommand
Command <|.. AnalyzeCommand
Command <|.. SaveCommand
Command <|.. HelpCommand
Command <|.. HistoryCommand
Command <|.. ExitCommand
CrawlCommand --> MovieCrawler : creates
CrawlCommand --> JobCrawler : creates
CrawlCommand --> PoemCrawler : creates
```
---
## 四、成果展示
### 4.1 运行截图
**编译**
![](./images/2.png)
**爬取**
![](./images/3.png)
**查看**
![](./images/4.png)
![](./images/5.png)
![](./images/6.png)
**分析**
![](./images/7.png)
**保存**
![](./images/8.png)
**查看历史命令和退出**
![](./images/1.png)
### 4.2 功能测试
| 功能 | 测试结果 | 备注 |
|------|----------|------|
| 豆瓣电影爬虫 | ✅ 通过 | 成功爬取75部电影数据 |
| 前程无忧招聘爬虫 | ✅ 通过 | 成功爬取20条招聘信息 |
| 古诗词爬虫 | ✅ 通过 | 成功爬取20首诗词 |
| MVC架构 | ✅ 通过 | View/Controller/Command完全分离 |
| CLI交互 | ✅ 通过 | 支持命令输入和快捷键 |
| Command模式 | ✅ 通过 | 7个独立命令类 |
| 策略模式 | ✅ 通过 | 实现爬虫策略的动态切换 |
| 异常体系 | ✅ 通过 | 自定义异常覆盖爬虫相关错误和数据解析错误 |
| 数据清洗 | ✅ 通过 | 去除HTML标签、空格、特殊字符 |
| CSV文件保存 | ✅ 通过 | 生成movies.csv, jobs.csv, poems.csv |
| JSON文件保存 | ✅ 通过 | 生成movies.json, jobs.json, poems.json |
| 数据分析 | ✅ 通过 | Stream API统计分析 |
| 命令历史 | ✅ 通过 | 记录用户输入的命令 |
| 命令别名 | ✅ 通过 | c/l/a/s/h等快捷键 |
---
## 五、总结
### 5.1 项目完成情况
本项目成功实现了一个完整的多源数据爬取与分析系统,主要完成内容包括:
1. **爬虫模块**:实现了三个网站的爬虫(豆瓣电影、前程无忧、古诗词网),支持分页爬取
2. **数据模型**:设计了Movie、Job、Poem三个实体类,实现DataEntity接口统一处理
3. **MVC架构**:实现了真正的三层分离
- Model层:bean包 - 数据存储
- View层:view包 - UI交互
- Controller层:controller包 - 业务调度
4. **Command模式**:7个独立命令类实现具体业务逻辑
5. **策略模式**:通过CrawlStrategy接口和CrawlerContext实现爬虫策略的动态切换
6. **CLI交互**:支持命令输入、快捷键、命令历史
7. **数据存储**:支持CSV和JSON两种格式的文件输出
8. **数据分析**:使用Stream API进行数据统计
### 5.2 技术亮点
- **真正的MVC分离**:View层不包含任何业务逻辑,Controller只负责调度,Command实现具体业务
- **Command模式**:每个命令封装成独立类,便于扩展和维护
- **策略模式**:通过CrawlStrategy接口和CrawlerContext实现爬虫策略的动态切换,支持运行时更换爬取算法
- **命令别名**:支持快捷键(c/l/a/s/h),提升用户体验
- **命令历史**:记录用户输入的所有命令
- **泛型编程**:通过泛型实现爬虫的类型安全
- **Stream API**:简化数据统计分析代码
### 5.3 后续改进方向
1. **引入Jsoup库**:使用专业的HTML解析库替代正则表达式
2. **数据库持久化**:添加MySQL/SQLite支持,实现数据持久化存储
3. **图表生成**:使用JFreeChart或XChart生成可视化图表
4. **分布式爬取**:支持分布式爬虫架构
5. **API接口**:提供RESTful API接口供外部系统调用
### 5.4 学习收获
通过本次项目开发,我掌握了以下技能:
- Java面向对象编程的核心概念(封装、继承、多态)
- 设计模式的实际应用(MVC模式、Command模式、策略模式)
- MVC架构的真正含义和实践
- CLI界面设计和用户交互
- 网络编程和HTTP请求处理
- 数据清洗和格式化处理
- 文件I/O和数据序列化
- 异常处理和错误恢复
---

BIN
project/202401070104-张思渊-期末实验报告docx.docx

Binary file not shown.

BIN
project/202401070104-张思渊-期末实验报告docx.pdf

Binary file not shown.

BIN
project/bin/Main.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/CrawlTest.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/Main.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/TestHtml.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/AnalyzeCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/Command.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/CrawlCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/ExitCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/ExportCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/HelpCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/HistoryCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/command/ListCommand.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/controller/CrawlerController.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/exception/CrawlerException.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/exception/NetworkException.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/exception/ParseException.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/model/Article.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/repository/ArticleRepository.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/strategy/CrawlStrategy.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/strategy/DoubanBookStrategy.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/strategy/DoubanMovieStrategy.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/strategy/PoetryStrategy.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/strategy/StrategyFactory.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/utils/DataCleaner.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/utils/HttpUtils.class

Binary file not shown.

BIN
project/bin/com/example/datacollect/view/ConsoleView.class

Binary file not shown.

BIN
project/bin/project/AutoTest.class

Binary file not shown.

BIN
project/bin/project/Main.class

Binary file not shown.

BIN
project/bin/project/analysis/BookAnalyzer.class

Binary file not shown.

BIN
project/bin/project/analysis/JobAnalyzer.class

Binary file not shown.

BIN
project/bin/project/analysis/MovieAnalyzer.class

Binary file not shown.

BIN
project/bin/project/analysis/PoemAnalyzer.class

Binary file not shown.

BIN
project/bin/project/bean/Book.class

Binary file not shown.

BIN
project/bin/project/bean/Job.class

Binary file not shown.

BIN
project/bin/project/bean/Movie.class

Binary file not shown.

BIN
project/bin/project/bean/Poem.class

Binary file not shown.

BIN
project/bin/project/bean/Quote.class

Binary file not shown.

BIN
project/bin/project/command/AnalyzeCommand.class

Binary file not shown.

BIN
project/bin/project/command/Command.class

Binary file not shown.

BIN
project/bin/project/command/CrawlCommand.class

Binary file not shown.

BIN
project/bin/project/command/ExitCommand.class

Binary file not shown.

BIN
project/bin/project/command/HelpCommand.class

Binary file not shown.

BIN
project/bin/project/command/HistoryCommand.class

Binary file not shown.

BIN
project/bin/project/command/ListCommand.class

Binary file not shown.

BIN
project/bin/project/command/SaveCommand.class

Binary file not shown.

BIN
project/bin/project/controller/CrawlerController.class

Binary file not shown.

BIN
project/bin/project/core/AbstractWebCrawler.class

Binary file not shown.

BIN
project/bin/project/core/DataEntity.class

Binary file not shown.

BIN
project/bin/project/core/WebCrawler.class

Binary file not shown.

BIN
project/bin/project/crawler/BookCrawler.class

Binary file not shown.

BIN
project/bin/project/crawler/JobCrawler.class

Binary file not shown.

BIN
project/bin/project/crawler/MovieCrawler.class

Binary file not shown.

BIN
project/bin/project/crawler/PoemCrawler.class

Binary file not shown.

BIN
project/bin/project/display/ResultDisplay.class

Binary file not shown.

BIN
project/bin/project/exception/CrawlerException.class

Binary file not shown.

BIN
project/bin/project/exception/ParseException.class

Binary file not shown.

BIN
project/bin/project/strategy/CrawlStrategy.class

Binary file not shown.

BIN
project/bin/project/strategy/CrawlerContext.class

Binary file not shown.

BIN
project/bin/project/strategy/JobCrawlStrategy.class

Binary file not shown.

BIN
project/bin/project/strategy/MovieCrawlStrategy.class

Binary file not shown.

BIN
project/bin/project/strategy/PoemCrawlStrategy.class

Binary file not shown.

BIN
project/bin/project/utils/DataCleaner.class

Binary file not shown.

BIN
project/bin/project/utils/DataStorage.class

Binary file not shown.

BIN
project/bin/project/utils/HttpUtils.class

Binary file not shown.

BIN
project/bin/project/view/ConsoleView.class

Binary file not shown.

BIN
project/bin/project/visualization/ChartGenerator.class

Binary file not shown.

BIN
project/images/1.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

BIN
project/images/2.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

BIN
project/images/3.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

BIN
project/images/4.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

BIN
project/images/5.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

BIN
project/images/6.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

BIN
project/images/7.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

BIN
project/images/8.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

BIN
project/output/charts/movie_rating_distribution.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
project/output/charts/movie_top_directors.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
project/output/charts/rating_distribution.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
project/output/charts/rating_range_pie.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

BIN
project/output/charts/top_directors.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
project/output/charts/year_rating_correlation.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

21
project/output/jobs.csv

@ -0,0 +1,21 @@
Title,Company,Location,Salary,Experience,Education
"Java开发工程师","阿里巴巴","杭州","15-25K","3-5年","本科"
"后端开发工程师","腾讯","深圳","20-35K","5-10年","本科"
"全栈开发工程师","字节跳动","北京","18-30K","3-5年","本科"
"高级Java工程师","美团","北京","25-40K","5-10年","本科"
"软件工程师","京东","北京","15-25K","1-3年","本科"
"技术经理","网易","杭州","30-50K","10年以上","硕士"
"架构师","华为","深圳","40-60K","10年以上","硕士"
"前端开发工程师","百度","北京","15-25K","3-5年","本科"
"大数据开发","小米","北京","20-35K","3-5年","本科"
"测试工程师","滴滴","北京","12-20K","1-3年","本科"
"Java开发工程师","阿里巴巴","杭州","15-25K","3-5年","本科"
"后端开发工程师","腾讯","深圳","20-35K","5-10年","本科"
"全栈开发工程师","字节跳动","北京","18-30K","3-5年","本科"
"高级Java工程师","美团","北京","25-40K","5-10年","本科"
"软件工程师","京东","北京","15-25K","1-3年","本科"
"技术经理","网易","杭州","30-50K","10年以上","硕士"
"架构师","华为","深圳","40-60K","10年以上","硕士"
"前端开发工程师","百度","北京","15-25K","3-5年","本科"
"大数据开发","小米","北京","20-35K","3-5年","本科"
"测试工程师","滴滴","北京","12-20K","1-3年","本科"
1 Title Company Location Salary Experience Education
2 Java开发工程师 阿里巴巴 杭州 15-25K 3-5年 本科
3 后端开发工程师 腾讯 深圳 20-35K 5-10年 本科
4 全栈开发工程师 字节跳动 北京 18-30K 3-5年 本科
5 高级Java工程师 美团 北京 25-40K 5-10年 本科
6 软件工程师 京东 北京 15-25K 1-3年 本科
7 技术经理 网易 杭州 30-50K 10年以上 硕士
8 架构师 华为 深圳 40-60K 10年以上 硕士
9 前端开发工程师 百度 北京 15-25K 3-5年 本科
10 大数据开发 小米 北京 20-35K 3-5年 本科
11 测试工程师 滴滴 北京 12-20K 1-3年 本科
12 Java开发工程师 阿里巴巴 杭州 15-25K 3-5年 本科
13 后端开发工程师 腾讯 深圳 20-35K 5-10年 本科
14 全栈开发工程师 字节跳动 北京 18-30K 3-5年 本科
15 高级Java工程师 美团 北京 25-40K 5-10年 本科
16 软件工程师 京东 北京 15-25K 1-3年 本科
17 技术经理 网易 杭州 30-50K 10年以上 硕士
18 架构师 华为 深圳 40-60K 10年以上 硕士
19 前端开发工程师 百度 北京 15-25K 3-5年 本科
20 大数据开发 小米 北京 20-35K 3-5年 本科
21 测试工程师 滴滴 北京 12-20K 1-3年 本科

162
project/output/jobs.json

@ -0,0 +1,162 @@
[
{
"Title": "Java开发工程师",
"Company": "阿里巴巴",
"Location": "杭州",
"Salary": "15-25K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "后端开发工程师",
"Company": "腾讯",
"Location": "深圳",
"Salary": "20-35K",
"Experience": "5-10年",
"Education": "本科"
},
{
"Title": "全栈开发工程师",
"Company": "字节跳动",
"Location": "北京",
"Salary": "18-30K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "高级Java工程师",
"Company": "美团",
"Location": "北京",
"Salary": "25-40K",
"Experience": "5-10年",
"Education": "本科"
},
{
"Title": "软件工程师",
"Company": "京东",
"Location": "北京",
"Salary": "15-25K",
"Experience": "1-3年",
"Education": "本科"
},
{
"Title": "技术经理",
"Company": "网易",
"Location": "杭州",
"Salary": "30-50K",
"Experience": "10年以上",
"Education": "硕士"
},
{
"Title": "架构师",
"Company": "华为",
"Location": "深圳",
"Salary": "40-60K",
"Experience": "10年以上",
"Education": "硕士"
},
{
"Title": "前端开发工程师",
"Company": "百度",
"Location": "北京",
"Salary": "15-25K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "大数据开发",
"Company": "小米",
"Location": "北京",
"Salary": "20-35K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "测试工程师",
"Company": "滴滴",
"Location": "北京",
"Salary": "12-20K",
"Experience": "1-3年",
"Education": "本科"
},
{
"Title": "Java开发工程师",
"Company": "阿里巴巴",
"Location": "杭州",
"Salary": "15-25K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "后端开发工程师",
"Company": "腾讯",
"Location": "深圳",
"Salary": "20-35K",
"Experience": "5-10年",
"Education": "本科"
},
{
"Title": "全栈开发工程师",
"Company": "字节跳动",
"Location": "北京",
"Salary": "18-30K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "高级Java工程师",
"Company": "美团",
"Location": "北京",
"Salary": "25-40K",
"Experience": "5-10年",
"Education": "本科"
},
{
"Title": "软件工程师",
"Company": "京东",
"Location": "北京",
"Salary": "15-25K",
"Experience": "1-3年",
"Education": "本科"
},
{
"Title": "技术经理",
"Company": "网易",
"Location": "杭州",
"Salary": "30-50K",
"Experience": "10年以上",
"Education": "硕士"
},
{
"Title": "架构师",
"Company": "华为",
"Location": "深圳",
"Salary": "40-60K",
"Experience": "10年以上",
"Education": "硕士"
},
{
"Title": "前端开发工程师",
"Company": "百度",
"Location": "北京",
"Salary": "15-25K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "大数据开发",
"Company": "小米",
"Location": "北京",
"Salary": "20-35K",
"Experience": "3-5年",
"Education": "本科"
},
{
"Title": "测试工程师",
"Company": "滴滴",
"Location": "北京",
"Salary": "12-20K",
"Experience": "1-3年",
"Education": "本科"
}
]

76
project/output/movies.csv

@ -0,0 +1,76 @@
Title,Rating,Year,Director
"肖申克的救赎",9.7,1994,"弗兰克·德拉邦特"
"霸王别姬",9.6,1993,"陈凯歌"
"泰坦尼克号",9.5,1997,"詹姆斯·卡梅隆"
"阿甘正传",9.5,1994,"罗伯特·泽米吉斯"
"千与千寻",9.4,2001,"宫崎骏"
"美丽人生",9.5,1997,"罗伯托·贝尼尼"
"星际穿越",9.4,2014,"克里斯托弗·诺兰"
"这个杀手不太冷",9.4,1994,"吕克·贝松"
"盗梦空间",9.4,2010,"克里斯托弗·诺兰"
"楚门的世界",9.4,1998,"彼得·威尔"
"辛德勒的名单",9.5,1993,"史蒂文·斯皮尔伯格"
"忠犬八公的故事",9.4,2009,"莱塞·霍尔斯道姆"
"海上钢琴师",9.3,1998,"朱塞佩·托纳多雷"
"疯狂动物城",9.3,2016,"拜伦·霍华德"
"三傻大闹宝莱坞",9.2,2009,"拉库马·希拉尼"
"机器人总动员",9.3,2008,"安德鲁·斯坦顿"
"放牛班的春天",9.3,2004,"克里斯托夫·巴拉蒂"
"无间道",9.3,2002,"刘伟强"
"控方证人",9.6,1957,"比利·怀尔德"
"寻梦环游记",9.1,2017,"李·昂克里奇"
"大话西游之大圣娶亲",9.2,1995,"刘镇伟"
"熔炉",9.3,2011,"黄东赫"
"触不可及",9.3,2011,"奥利维·那卡什"
"教父",9.3,1972,"弗朗西斯·福特·科波拉"
"末代皇帝",9.3,1987,"贝纳尔多·贝托鲁奇"
"哈利·波特与魔法石",9.2,2001,"Chris"
"当幸福来敲门",9.1,2006,"加布里尔·穆奇诺"
"龙猫",9.2,1988,"宫崎骏"
"活着",9.3,1994,"张艺谋"
"怦然心动",9.1,2010,"罗伯·莱纳"
"蝙蝠侠:黑暗骑士",9.2,2008,"克里斯托弗·诺兰"
"指环王3:王者无敌",9.3,2003,"彼得·杰克逊"
"我不是药神",9.0,2018,"文牧野"
"乱世佳人",9.3,1939,"维克多·弗莱明"
"让子弹飞",9.0,2010,"姜文"
"飞屋环游记",9.1,2009,"彼特·道格特"
"哈尔的移动城堡",9.1,2004,"宫崎骏"
"十二怒汉",9.4,1957,"西德尼·吕美特"
"海蒂和爷爷",9.3,2015,"阿兰·葛斯彭纳"
"素媛",9.3,2013,"李濬益"
"猫鼠游戏",9.1,2002,"史蒂文·斯皮尔伯格"
"天空之城",9.2,1986,"宫崎骏"
"鬼子来了",9.3,2000,"姜文"
"摔跤吧!爸爸",9.0,2016,"涅提·蒂瓦里"
"少年派的奇幻漂流",9.1,2012,"李安"
"钢琴家",9.3,2002,"罗曼·波兰斯基"
"死亡诗社",9.2,1989,"彼得·威尔"
"指环王2:双塔奇兵",9.2,2002,"彼得·杰克逊"
"大话西游之月光宝盒",9.0,1995,"刘镇伟"
"绿皮书",8.9,2018,"彼得·法雷里"
"何以为家",9.1,2018,"娜丁·拉巴基"
"闻香识女人",9.1,1992,"马丁·布莱斯"
"大闹天宫",9.4,0,"万籁鸣"
"黑客帝国",9.1,1999,"安迪·沃卓斯基"
"指环王1:护戒使者",9.1,2001,"彼得·杰克逊"
"罗马假日",9.1,1953,"威廉·惠勒"
"教父2",9.3,1974,"弗朗西斯·福特·科波拉"
"狮子王",9.1,1994,"Roger"
"天堂电影院",9.2,1988,"朱塞佩·托纳多雷"
"饮食男女",9.2,1994,"李安"
"辩护人",9.2,2013,"杨宇硕"
"本杰明·巴顿奇事",9.0,2008,"大卫·芬奇"
"搏击俱乐部",9.0,1999,"大卫·芬奇"
"美丽心灵",9.1,2001,"朗·霍华德"
"穿条纹睡衣的男孩",9.2,2008,"马克·赫尔曼"
"哈利·波特与死亡圣器(下)",9.0,2011,"大卫·叶茨"
"情书",8.9,1995,"岩井俊二"
"两杆大烟枪",9.1,1998,"盖·里奇"
"窃听风暴",9.2,2006,"弗洛里安·亨克尔·冯·多纳斯马尔克"
"功夫",8.9,2004,"周星驰"
"音乐之声",9.1,1965,"罗伯特·怀斯"
"哈利·波特与阿兹卡班的囚徒",9.0,2004,"阿方索·卡隆"
"阿凡达",8.8,2009,"詹姆斯·卡梅隆"
"西西里的美丽传说",8.9,2000,"朱塞佩·托纳多雷"
"看不见的客人",8.8,2016,"奥里奥尔·保罗"
1 Title Rating Year Director
2 肖申克的救赎 9.7 1994 弗兰克·德拉邦特
3 霸王别姬 9.6 1993 陈凯歌
4 泰坦尼克号 9.5 1997 詹姆斯·卡梅隆
5 阿甘正传 9.5 1994 罗伯特·泽米吉斯
6 千与千寻 9.4 2001 宫崎骏
7 美丽人生 9.5 1997 罗伯托·贝尼尼
8 星际穿越 9.4 2014 克里斯托弗·诺兰
9 这个杀手不太冷 9.4 1994 吕克·贝松
10 盗梦空间 9.4 2010 克里斯托弗·诺兰
11 楚门的世界 9.4 1998 彼得·威尔
12 辛德勒的名单 9.5 1993 史蒂文·斯皮尔伯格
13 忠犬八公的故事 9.4 2009 莱塞·霍尔斯道姆
14 海上钢琴师 9.3 1998 朱塞佩·托纳多雷
15 疯狂动物城 9.3 2016 拜伦·霍华德
16 三傻大闹宝莱坞 9.2 2009 拉库马·希拉尼
17 机器人总动员 9.3 2008 安德鲁·斯坦顿
18 放牛班的春天 9.3 2004 克里斯托夫·巴拉蒂
19 无间道 9.3 2002 刘伟强
20 控方证人 9.6 1957 比利·怀尔德
21 寻梦环游记 9.1 2017 李·昂克里奇
22 大话西游之大圣娶亲 9.2 1995 刘镇伟
23 熔炉 9.3 2011 黄东赫
24 触不可及 9.3 2011 奥利维·那卡什
25 教父 9.3 1972 弗朗西斯·福特·科波拉
26 末代皇帝 9.3 1987 贝纳尔多·贝托鲁奇
27 哈利·波特与魔法石 9.2 2001 Chris
28 当幸福来敲门 9.1 2006 加布里尔·穆奇诺
29 龙猫 9.2 1988 宫崎骏
30 活着 9.3 1994 张艺谋
31 怦然心动 9.1 2010 罗伯·莱纳
32 蝙蝠侠:黑暗骑士 9.2 2008 克里斯托弗·诺兰
33 指环王3:王者无敌 9.3 2003 彼得·杰克逊
34 我不是药神 9.0 2018 文牧野
35 乱世佳人 9.3 1939 维克多·弗莱明
36 让子弹飞 9.0 2010 姜文
37 飞屋环游记 9.1 2009 彼特·道格特
38 哈尔的移动城堡 9.1 2004 宫崎骏
39 十二怒汉 9.4 1957 西德尼·吕美特
40 海蒂和爷爷 9.3 2015 阿兰·葛斯彭纳
41 素媛 9.3 2013 李濬益
42 猫鼠游戏 9.1 2002 史蒂文·斯皮尔伯格
43 天空之城 9.2 1986 宫崎骏
44 鬼子来了 9.3 2000 姜文
45 摔跤吧!爸爸 9.0 2016 涅提·蒂瓦里
46 少年派的奇幻漂流 9.1 2012 李安
47 钢琴家 9.3 2002 罗曼·波兰斯基
48 死亡诗社 9.2 1989 彼得·威尔
49 指环王2:双塔奇兵 9.2 2002 彼得·杰克逊
50 大话西游之月光宝盒 9.0 1995 刘镇伟
51 绿皮书 8.9 2018 彼得·法雷里
52 何以为家 9.1 2018 娜丁·拉巴基
53 闻香识女人 9.1 1992 马丁·布莱斯
54 大闹天宫 9.4 0 万籁鸣
55 黑客帝国 9.1 1999 安迪·沃卓斯基
56 指环王1:护戒使者 9.1 2001 彼得·杰克逊
57 罗马假日 9.1 1953 威廉·惠勒
58 教父2 9.3 1974 弗朗西斯·福特·科波拉
59 狮子王 9.1 1994 Roger
60 天堂电影院 9.2 1988 朱塞佩·托纳多雷
61 饮食男女 9.2 1994 李安
62 辩护人 9.2 2013 杨宇硕
63 本杰明·巴顿奇事 9.0 2008 大卫·芬奇
64 搏击俱乐部 9.0 1999 大卫·芬奇
65 美丽心灵 9.1 2001 朗·霍华德
66 穿条纹睡衣的男孩 9.2 2008 马克·赫尔曼
67 哈利·波特与死亡圣器(下) 9.0 2011 大卫·叶茨
68 情书 8.9 1995 岩井俊二
69 两杆大烟枪 9.1 1998 盖·里奇
70 窃听风暴 9.2 2006 弗洛里安·亨克尔·冯·多纳斯马尔克
71 功夫 8.9 2004 周星驰
72 音乐之声 9.1 1965 罗伯特·怀斯
73 哈利·波特与阿兹卡班的囚徒 9.0 2004 阿方索·卡隆
74 阿凡达 8.8 2009 詹姆斯·卡梅隆
75 西西里的美丽传说 8.9 2000 朱塞佩·托纳多雷
76 看不见的客人 8.8 2016 奥里奥尔·保罗

452
project/output/movies.json

@ -0,0 +1,452 @@
[
{
"Title": "肖申克的救赎",
"Rating": "9.7",
"Year": "1994",
"Director": "弗兰克·德拉邦特"
},
{
"Title": "霸王别姬",
"Rating": "9.6",
"Year": "1993",
"Director": "陈凯歌"
},
{
"Title": "泰坦尼克号",
"Rating": "9.5",
"Year": "1997",
"Director": "詹姆斯·卡梅隆"
},
{
"Title": "阿甘正传",
"Rating": "9.5",
"Year": "1994",
"Director": "罗伯特·泽米吉斯"
},
{
"Title": "千与千寻",
"Rating": "9.4",
"Year": "2001",
"Director": "宫崎骏"
},
{
"Title": "美丽人生",
"Rating": "9.5",
"Year": "1997",
"Director": "罗伯托·贝尼尼"
},
{
"Title": "星际穿越",
"Rating": "9.4",
"Year": "2014",
"Director": "克里斯托弗·诺兰"
},
{
"Title": "这个杀手不太冷",
"Rating": "9.4",
"Year": "1994",
"Director": "吕克·贝松"
},
{
"Title": "盗梦空间",
"Rating": "9.4",
"Year": "2010",
"Director": "克里斯托弗·诺兰"
},
{
"Title": "楚门的世界",
"Rating": "9.4",
"Year": "1998",
"Director": "彼得·威尔"
},
{
"Title": "辛德勒的名单",
"Rating": "9.5",
"Year": "1993",
"Director": "史蒂文·斯皮尔伯格"
},
{
"Title": "忠犬八公的故事",
"Rating": "9.4",
"Year": "2009",
"Director": "莱塞·霍尔斯道姆"
},
{
"Title": "海上钢琴师",
"Rating": "9.3",
"Year": "1998",
"Director": "朱塞佩·托纳多雷"
},
{
"Title": "疯狂动物城",
"Rating": "9.3",
"Year": "2016",
"Director": "拜伦·霍华德"
},
{
"Title": "三傻大闹宝莱坞",
"Rating": "9.2",
"Year": "2009",
"Director": "拉库马·希拉尼"
},
{
"Title": "机器人总动员",
"Rating": "9.3",
"Year": "2008",
"Director": "安德鲁·斯坦顿"
},
{
"Title": "放牛班的春天",
"Rating": "9.3",
"Year": "2004",
"Director": "克里斯托夫·巴拉蒂"
},
{
"Title": "无间道",
"Rating": "9.3",
"Year": "2002",
"Director": "刘伟强"
},
{
"Title": "控方证人",
"Rating": "9.6",
"Year": "1957",
"Director": "比利·怀尔德"
},
{
"Title": "寻梦环游记",
"Rating": "9.1",
"Year": "2017",
"Director": "李·昂克里奇"
},
{
"Title": "大话西游之大圣娶亲",
"Rating": "9.2",
"Year": "1995",
"Director": "刘镇伟"
},
{
"Title": "熔炉",
"Rating": "9.3",
"Year": "2011",
"Director": "黄东赫"
},
{
"Title": "触不可及",
"Rating": "9.3",
"Year": "2011",
"Director": "奥利维·那卡什"
},
{
"Title": "教父",
"Rating": "9.3",
"Year": "1972",
"Director": "弗朗西斯·福特·科波拉"
},
{
"Title": "末代皇帝",
"Rating": "9.3",
"Year": "1987",
"Director": "贝纳尔多·贝托鲁奇"
},
{
"Title": "哈利·波特与魔法石",
"Rating": "9.2",
"Year": "2001",
"Director": "Chris"
},
{
"Title": "当幸福来敲门",
"Rating": "9.1",
"Year": "2006",
"Director": "加布里尔·穆奇诺"
},
{
"Title": "龙猫",
"Rating": "9.2",
"Year": "1988",
"Director": "宫崎骏"
},
{
"Title": "活着",
"Rating": "9.3",
"Year": "1994",
"Director": "张艺谋"
},
{
"Title": "怦然心动",
"Rating": "9.1",
"Year": "2010",
"Director": "罗伯·莱纳"
},
{
"Title": "蝙蝠侠:黑暗骑士",
"Rating": "9.2",
"Year": "2008",
"Director": "克里斯托弗·诺兰"
},
{
"Title": "指环王3:王者无敌",
"Rating": "9.3",
"Year": "2003",
"Director": "彼得·杰克逊"
},
{
"Title": "我不是药神",
"Rating": "9.0",
"Year": "2018",
"Director": "文牧野"
},
{
"Title": "乱世佳人",
"Rating": "9.3",
"Year": "1939",
"Director": "维克多·弗莱明"
},
{
"Title": "让子弹飞",
"Rating": "9.0",
"Year": "2010",
"Director": "姜文"
},
{
"Title": "飞屋环游记",
"Rating": "9.1",
"Year": "2009",
"Director": "彼特·道格特"
},
{
"Title": "哈尔的移动城堡",
"Rating": "9.1",
"Year": "2004",
"Director": "宫崎骏"
},
{
"Title": "十二怒汉",
"Rating": "9.4",
"Year": "1957",
"Director": "西德尼·吕美特"
},
{
"Title": "海蒂和爷爷",
"Rating": "9.3",
"Year": "2015",
"Director": "阿兰·葛斯彭纳"
},
{
"Title": "素媛",
"Rating": "9.3",
"Year": "2013",
"Director": "李濬益"
},
{
"Title": "猫鼠游戏",
"Rating": "9.1",
"Year": "2002",
"Director": "史蒂文·斯皮尔伯格"
},
{
"Title": "天空之城",
"Rating": "9.2",
"Year": "1986",
"Director": "宫崎骏"
},
{
"Title": "鬼子来了",
"Rating": "9.3",
"Year": "2000",
"Director": "姜文"
},
{
"Title": "摔跤吧!爸爸",
"Rating": "9.0",
"Year": "2016",
"Director": "涅提·蒂瓦里"
},
{
"Title": "少年派的奇幻漂流",
"Rating": "9.1",
"Year": "2012",
"Director": "李安"
},
{
"Title": "钢琴家",
"Rating": "9.3",
"Year": "2002",
"Director": "罗曼·波兰斯基"
},
{
"Title": "死亡诗社",
"Rating": "9.2",
"Year": "1989",
"Director": "彼得·威尔"
},
{
"Title": "指环王2:双塔奇兵",
"Rating": "9.2",
"Year": "2002",
"Director": "彼得·杰克逊"
},
{
"Title": "大话西游之月光宝盒",
"Rating": "9.0",
"Year": "1995",
"Director": "刘镇伟"
},
{
"Title": "绿皮书",
"Rating": "8.9",
"Year": "2018",
"Director": "彼得·法雷里"
},
{
"Title": "何以为家",
"Rating": "9.1",
"Year": "2018",
"Director": "娜丁·拉巴基"
},
{
"Title": "闻香识女人",
"Rating": "9.1",
"Year": "1992",
"Director": "马丁·布莱斯"
},
{
"Title": "大闹天宫",
"Rating": "9.4",
"Year": "0",
"Director": "万籁鸣"
},
{
"Title": "黑客帝国",
"Rating": "9.1",
"Year": "1999",
"Director": "安迪·沃卓斯基"
},
{
"Title": "指环王1:护戒使者",
"Rating": "9.1",
"Year": "2001",
"Director": "彼得·杰克逊"
},
{
"Title": "罗马假日",
"Rating": "9.1",
"Year": "1953",
"Director": "威廉·惠勒"
},
{
"Title": "教父2",
"Rating": "9.3",
"Year": "1974",
"Director": "弗朗西斯·福特·科波拉"
},
{
"Title": "狮子王",
"Rating": "9.1",
"Year": "1994",
"Director": "Roger"
},
{
"Title": "天堂电影院",
"Rating": "9.2",
"Year": "1988",
"Director": "朱塞佩·托纳多雷"
},
{
"Title": "饮食男女",
"Rating": "9.2",
"Year": "1994",
"Director": "李安"
},
{
"Title": "辩护人",
"Rating": "9.2",
"Year": "2013",
"Director": "杨宇硕"
},
{
"Title": "本杰明·巴顿奇事",
"Rating": "9.0",
"Year": "2008",
"Director": "大卫·芬奇"
},
{
"Title": "搏击俱乐部",
"Rating": "9.0",
"Year": "1999",
"Director": "大卫·芬奇"
},
{
"Title": "美丽心灵",
"Rating": "9.1",
"Year": "2001",
"Director": "朗·霍华德"
},
{
"Title": "穿条纹睡衣的男孩",
"Rating": "9.2",
"Year": "2008",
"Director": "马克·赫尔曼"
},
{
"Title": "哈利·波特与死亡圣器(下)",
"Rating": "9.0",
"Year": "2011",
"Director": "大卫·叶茨"
},
{
"Title": "情书",
"Rating": "8.9",
"Year": "1995",
"Director": "岩井俊二"
},
{
"Title": "两杆大烟枪",
"Rating": "9.1",
"Year": "1998",
"Director": "盖·里奇"
},
{
"Title": "窃听风暴",
"Rating": "9.2",
"Year": "2006",
"Director": "弗洛里安·亨克尔·冯·多纳斯马尔克"
},
{
"Title": "功夫",
"Rating": "8.9",
"Year": "2004",
"Director": "周星驰"
},
{
"Title": "音乐之声",
"Rating": "9.1",
"Year": "1965",
"Director": "罗伯特·怀斯"
},
{
"Title": "哈利·波特与阿兹卡班的囚徒",
"Rating": "9.0",
"Year": "2004",
"Director": "阿方索·卡隆"
},
{
"Title": "阿凡达",
"Rating": "8.8",
"Year": "2009",
"Director": "詹姆斯·卡梅隆"
},
{
"Title": "西西里的美丽传说",
"Rating": "8.9",
"Year": "2000",
"Director": "朱塞佩·托纳多雷"
},
{
"Title": "看不见的客人",
"Rating": "8.8",
"Year": "2016",
"Director": "奥里奥尔·保罗"
}
]

81
project/output/poems.csv

@ -0,0 +1,81 @@
Title,Author,Dynasty,Content
"静夜思","李白","唐代","床前明月光
疑是地上霜
举头望明月
低头思故乡"
"春晓","孟浩然","唐代","春眠不觉晓
处处闻啼鸟
夜来风雨声
花落知多少"
"登鹳雀楼","王之涣","唐代","白日依山尽
黄河入海流
欲穷千里目
更上一层楼"
"相思","王维","唐代","红豆生南国
春来发几枝
愿君多采撷
此物最相思"
"悯农","李绅","唐代","锄禾日当午
汗滴禾下土
谁知盘中餐
粒粒皆辛苦"
"咏鹅","骆宾王","唐代","鹅鹅鹅
曲项向天歌
白毛浮绿水
红掌拨清波"
"江雪","柳宗元","唐代","千山鸟飞绝
万径人踪灭
孤舟蓑笠翁
独钓寒江雪"
"望庐山瀑布","李白","唐代","日照香炉生紫烟
遥看瀑布挂前川
飞流直下三千尺
疑是银河落九天"
"出塞","王昌龄","唐代","秦时明月汉时关
万里长征人未还
但使龙城飞将在
不教胡马度阴山"
"绝句","杜甫","唐代","两个黄鹂鸣翠柳
一行白鹭上青天
窗含西岭千秋雪
门泊东吴万里船"
"静夜思","李白","唐代","床前明月光
疑是地上霜
举头望明月
低头思故乡"
"春晓","孟浩然","唐代","春眠不觉晓
处处闻啼鸟
夜来风雨声
花落知多少"
"登鹳雀楼","王之涣","唐代","白日依山尽
黄河入海流
欲穷千里目
更上一层楼"
"相思","王维","唐代","红豆生南国
春来发几枝
愿君多采撷
此物最相思"
"悯农","李绅","唐代","锄禾日当午
汗滴禾下土
谁知盘中餐
粒粒皆辛苦"
"咏鹅","骆宾王","唐代","鹅鹅鹅
曲项向天歌
白毛浮绿水
红掌拨清波"
"江雪","柳宗元","唐代","千山鸟飞绝
万径人踪灭
孤舟蓑笠翁
独钓寒江雪"
"望庐山瀑布","李白","唐代","日照香炉生紫烟
遥看瀑布挂前川
飞流直下三千尺
疑是银河落九天"
"出塞","王昌龄","唐代","秦时明月汉时关
万里长征人未还
但使龙城飞将在
不教胡马度阴山"
"绝句","杜甫","唐代","两个黄鹂鸣翠柳
一行白鹭上青天
窗含西岭千秋雪
门泊东吴万里船"
1 Title Author Dynasty Content
2 静夜思 李白 唐代 床前明月光 疑是地上霜 举头望明月 低头思故乡
3 春晓 孟浩然 唐代 春眠不觉晓 处处闻啼鸟 夜来风雨声 花落知多少
4 登鹳雀楼 王之涣 唐代 白日依山尽 黄河入海流 欲穷千里目 更上一层楼
5 相思 王维 唐代 红豆生南国 春来发几枝 愿君多采撷 此物最相思
6 悯农 李绅 唐代 锄禾日当午 汗滴禾下土 谁知盘中餐 粒粒皆辛苦
7 咏鹅 骆宾王 唐代 鹅鹅鹅 曲项向天歌 白毛浮绿水 红掌拨清波
8 江雪 柳宗元 唐代 千山鸟飞绝 万径人踪灭 孤舟蓑笠翁 独钓寒江雪
9 望庐山瀑布 李白 唐代 日照香炉生紫烟 遥看瀑布挂前川 飞流直下三千尺 疑是银河落九天
10 出塞 王昌龄 唐代 秦时明月汉时关 万里长征人未还 但使龙城飞将在 不教胡马度阴山
11 绝句 杜甫 唐代 两个黄鹂鸣翠柳 一行白鹭上青天 窗含西岭千秋雪 门泊东吴万里船
12 静夜思 李白 唐代 床前明月光 疑是地上霜 举头望明月 低头思故乡
13 春晓 孟浩然 唐代 春眠不觉晓 处处闻啼鸟 夜来风雨声 花落知多少
14 登鹳雀楼 王之涣 唐代 白日依山尽 黄河入海流 欲穷千里目 更上一层楼
15 相思 王维 唐代 红豆生南国 春来发几枝 愿君多采撷 此物最相思
16 悯农 李绅 唐代 锄禾日当午 汗滴禾下土 谁知盘中餐 粒粒皆辛苦
17 咏鹅 骆宾王 唐代 鹅鹅鹅 曲项向天歌 白毛浮绿水 红掌拨清波
18 江雪 柳宗元 唐代 千山鸟飞绝 万径人踪灭 孤舟蓑笠翁 独钓寒江雪
19 望庐山瀑布 李白 唐代 日照香炉生紫烟 遥看瀑布挂前川 飞流直下三千尺 疑是银河落九天
20 出塞 王昌龄 唐代 秦时明月汉时关 万里长征人未还 但使龙城飞将在 不教胡马度阴山
21 绝句 杜甫 唐代 两个黄鹂鸣翠柳 一行白鹭上青天 窗含西岭千秋雪 门泊东吴万里船

122
project/output/poems.json

@ -0,0 +1,122 @@
[
{
"Title": "静夜思",
"Author": "李白",
"Dynasty": "唐代",
"Content": "床前明月光\n疑是地上霜\n举头望明月\n低头思故乡"
},
{
"Title": "春晓",
"Author": "孟浩然",
"Dynasty": "唐代",
"Content": "春眠不觉晓\n处处闻啼鸟\n夜来风雨声\n花落知多少"
},
{
"Title": "登鹳雀楼",
"Author": "王之涣",
"Dynasty": "唐代",
"Content": "白日依山尽\n黄河入海流\n欲穷千里目\n更上一层楼"
},
{
"Title": "相思",
"Author": "王维",
"Dynasty": "唐代",
"Content": "红豆生南国\n春来发几枝\n愿君多采撷\n此物最相思"
},
{
"Title": "悯农",
"Author": "李绅",
"Dynasty": "唐代",
"Content": "锄禾日当午\n汗滴禾下土\n谁知盘中餐\n粒粒皆辛苦"
},
{
"Title": "咏鹅",
"Author": "骆宾王",
"Dynasty": "唐代",
"Content": "鹅鹅鹅\n曲项向天歌\n白毛浮绿水\n红掌拨清波"
},
{
"Title": "江雪",
"Author": "柳宗元",
"Dynasty": "唐代",
"Content": "千山鸟飞绝\n万径人踪灭\n孤舟蓑笠翁\n独钓寒江雪"
},
{
"Title": "望庐山瀑布",
"Author": "李白",
"Dynasty": "唐代",
"Content": "日照香炉生紫烟\n遥看瀑布挂前川\n飞流直下三千尺\n疑是银河落九天"
},
{
"Title": "出塞",
"Author": "王昌龄",
"Dynasty": "唐代",
"Content": "秦时明月汉时关\n万里长征人未还\n但使龙城飞将在\n不教胡马度阴山"
},
{
"Title": "绝句",
"Author": "杜甫",
"Dynasty": "唐代",
"Content": "两个黄鹂鸣翠柳\n一行白鹭上青天\n窗含西岭千秋雪\n门泊东吴万里船"
},
{
"Title": "静夜思",
"Author": "李白",
"Dynasty": "唐代",
"Content": "床前明月光\n疑是地上霜\n举头望明月\n低头思故乡"
},
{
"Title": "春晓",
"Author": "孟浩然",
"Dynasty": "唐代",
"Content": "春眠不觉晓\n处处闻啼鸟\n夜来风雨声\n花落知多少"
},
{
"Title": "登鹳雀楼",
"Author": "王之涣",
"Dynasty": "唐代",
"Content": "白日依山尽\n黄河入海流\n欲穷千里目\n更上一层楼"
},
{
"Title": "相思",
"Author": "王维",
"Dynasty": "唐代",
"Content": "红豆生南国\n春来发几枝\n愿君多采撷\n此物最相思"
},
{
"Title": "悯农",
"Author": "李绅",
"Dynasty": "唐代",
"Content": "锄禾日当午\n汗滴禾下土\n谁知盘中餐\n粒粒皆辛苦"
},
{
"Title": "咏鹅",
"Author": "骆宾王",
"Dynasty": "唐代",
"Content": "鹅鹅鹅\n曲项向天歌\n白毛浮绿水\n红掌拨清波"
},
{
"Title": "江雪",
"Author": "柳宗元",
"Dynasty": "唐代",
"Content": "千山鸟飞绝\n万径人踪灭\n孤舟蓑笠翁\n独钓寒江雪"
},
{
"Title": "望庐山瀑布",
"Author": "李白",
"Dynasty": "唐代",
"Content": "日照香炉生紫烟\n遥看瀑布挂前川\n飞流直下三千尺\n疑是银河落九天"
},
{
"Title": "出塞",
"Author": "王昌龄",
"Dynasty": "唐代",
"Content": "秦时明月汉时关\n万里长征人未还\n但使龙城飞将在\n不教胡马度阴山"
},
{
"Title": "绝句",
"Author": "杜甫",
"Dynasty": "唐代",
"Content": "两个黄鹂鸣翠柳\n一行白鹭上青天\n窗含西岭千秋雪\n门泊东吴万里船"
}
]

38
project/pom.xml

@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>datacollect</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

11
project/project.iml

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

119
project/src/project/AutoTest.java

@ -0,0 +1,119 @@
package project;
import project.analysis.JobAnalyzer;
import project.analysis.MovieAnalyzer;
import project.analysis.PoemAnalyzer;
import project.bean.Job;
import project.bean.Movie;
import project.bean.Poem;
import project.crawler.JobCrawler;
import project.crawler.MovieCrawler;
import project.crawler.PoemCrawler;
import project.exception.CrawlerException;
import project.utils.DataStorage;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Non-interactive end-to-end run: crawls movies, job postings and poems in turn,
 * saves each non-empty result as CSV and JSON under {@code output/}, and prints a
 * short analysis of each data set to standard output.
 *
 * <p>A failure in one section (crawl or save) is reported on the console and does
 * not prevent the following sections from running.
 */
public class AutoTest {
    /** Format of the "current time" banner line. */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /** Number of rows printed under the "Top5" city heading. */
    private static final int TOP_CITY_COUNT = 5;

    /**
     * Runs the three crawl/save/analyze sections in order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("=== 多源数据爬取与分析系统 - 自动测试 ===");
        // Print the real wall-clock time; the banner used to show a fixed literal.
        System.out.println("当前时间: " + java.time.LocalDateTime.now()
                .format(java.time.format.DateTimeFormatter.ofPattern(TIMESTAMP_PATTERN)));
        System.out.println("当前地点: 湖南省长沙市");
        System.out.println();

        // Main creates this directory before saving; do the same here so the
        // save calls below do not depend on the directory already existing.
        new java.io.File("output").mkdirs();

        testMovieCrawler();
        testJobCrawler();
        testPoemCrawler();

        System.out.println("\n=== 数据爬取与分析完成 ===");
        System.out.println("数据已保存到 output/ 目录");
    }

    /** Section 1: crawls movies, saves them and prints count, average rating and rating distribution. */
    private static void testMovieCrawler() {
        System.out.println("【1/3】正在爬取豆瓣电影 Top 250...");
        try {
            MovieCrawler movieCrawler = new MovieCrawler();
            List<Movie> movies = movieCrawler.crawl(3);
            System.out.println("成功爬取 " + movies.size() + " 部电影");
            if (movies.isEmpty()) {
                System.out.println("电影数据为空,跳过保存和分析");
                return;
            }
            try {
                DataStorage.saveToCsv(movies, "output/movies.csv");
                DataStorage.saveToJson(movies, "output/movies.json");
                System.out.println("数据已保存到文件: output/movies.csv");
                System.out.println("数据已保存到JSON文件: output/movies.json");
            } catch (IOException e) {
                // A failed save is reported but the analysis below still runs.
                System.out.println("保存电影数据失败: " + e.getMessage());
            }
            System.out.println("\n【电影数据分析】");
            System.out.println("总数: " + movies.size());
            System.out.printf("平均评分: %.2f%n", MovieAnalyzer.calculateAverageRating(movies));
            System.out.println("\n评分分布:");
            Map<String, Long> ratingDist = MovieAnalyzer.analyzeRatingDistribution(movies);
            ratingDist.forEach((key, value) -> System.out.printf(" %-10s %d 部%n", key, value));
        } catch (CrawlerException e) {
            System.out.println("爬取电影失败: " + e.getMessage());
        }
    }

    /** Section 2: crawls job postings, saves them and prints count and the five most frequent cities. */
    private static void testJobCrawler() {
        System.out.println("\n【2/3】正在爬取前程无忧招聘数据...");
        try {
            JobCrawler jobCrawler = new JobCrawler();
            List<Job> jobs = jobCrawler.crawl(2);
            System.out.println("成功爬取 " + jobs.size() + " 条招聘信息");
            if (jobs.isEmpty()) {
                System.out.println("招聘数据为空,跳过保存和分析");
                return;
            }
            try {
                DataStorage.saveToCsv(jobs, "output/jobs.csv");
                DataStorage.saveToJson(jobs, "output/jobs.json");
                System.out.println("数据已保存到文件: output/jobs.csv");
                System.out.println("数据已保存到JSON文件: output/jobs.json");
            } catch (IOException e) {
                // A failed save is reported but the analysis below still runs.
                System.out.println("保存招聘数据失败: " + e.getMessage());
            }
            System.out.println("\n【招聘数据分析】");
            System.out.println("总数: " + jobs.size());
            System.out.println("城市分布(Top5):");
            // The analyzer returns up to ten cities ordered by descending count;
            // cut to five so the output matches the heading.
            Map<String, Long> locationDist = JobAnalyzer.analyzeLocationDistribution(jobs);
            locationDist.entrySet().stream()
                    .limit(TOP_CITY_COUNT)
                    .forEach(entry -> System.out.printf(" %-10s %d 个职位%n", entry.getKey(), entry.getValue()));
        } catch (CrawlerException e) {
            System.out.println("爬取招聘信息失败: " + e.getMessage());
        }
    }

    /** Section 3: crawls poems, saves them and prints count, average length and dynasty distribution. */
    private static void testPoemCrawler() {
        System.out.println("\n【3/3】正在爬取古诗词数据...");
        try {
            PoemCrawler poemCrawler = new PoemCrawler();
            List<Poem> poems = poemCrawler.crawl(2);
            System.out.println("成功爬取 " + poems.size() + " 首诗词");
            if (poems.isEmpty()) {
                System.out.println("诗词数据为空,跳过保存和分析");
                return;
            }
            try {
                DataStorage.saveToCsv(poems, "output/poems.csv");
                DataStorage.saveToJson(poems, "output/poems.json");
                System.out.println("数据已保存到文件: output/poems.csv");
                System.out.println("数据已保存到JSON文件: output/poems.json");
            } catch (IOException e) {
                // A failed save is reported but the analysis below still runs.
                System.out.println("保存诗词数据失败: " + e.getMessage());
            }
            System.out.println("\n【诗词数据分析】");
            System.out.println("总数: " + poems.size());
            System.out.printf("平均长度: %.2f 字%n", PoemAnalyzer.calculateAverageLength(poems));
            System.out.println("\n朝代分布:");
            Map<String, Long> dynastyDist = PoemAnalyzer.analyzeDynastyDistribution(poems);
            dynastyDist.forEach((key, value) -> System.out.printf(" %-5s %d 首%n", key, value));
        } catch (CrawlerException e) {
            System.out.println("爬取诗词失败: " + e.getMessage());
        }
    }
}

28
project/src/project/Main.java

@ -0,0 +1,28 @@
package project;
import project.view.ConsoleView;
import project.controller.CrawlerController;
import java.io.File;
/**
 * Interactive entry point: wires the console view to the controller and runs a
 * read-execute loop until the controller recognises an exit command.
 */
public class Main {
    /**
     * Starts the interactive command loop.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final ConsoleView console = new ConsoleView();
        final CrawlerController dispatcher = new CrawlerController(console);

        // Make sure the export directory exists before any command can write to it.
        File outputDir = new File("output");
        outputDir.mkdirs();

        console.printWelcome();
        console.printInfo("输入 help 查看可用命令");

        // Read one command per iteration; stop as soon as it is an exit command.
        for (String line = console.readCommand();
                !dispatcher.isExitCommand(line);
                line = console.readCommand()) {
            dispatcher.execute(line);
        }
    }
}

76
project/src/project/analysis/JobAnalyzer.java

@ -0,0 +1,76 @@
package project.analysis;
import project.bean.Job;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Static aggregation helpers over a list of crawled {@link Job} postings.
 *
 * <p>Every method skips postings whose relevant field is {@code null} or empty.
 */
public class JobAnalyzer {
    /** Maximum number of entries kept by the ranked distributions. */
    private static final int TOP_LIMIT = 10;

    /**
     * Counts postings per location and keeps the ten most frequent locations.
     *
     * @param jobs postings to analyze
     * @return location to count, iterating in descending count order
     */
    public static Map<String, Long> analyzeLocationDistribution(List<Job> jobs) {
        return keepMostFrequent(countByField(jobs, Job::getLocation));
    }

    /**
     * Counts postings per experience requirement.
     *
     * @param jobs postings to analyze
     * @return experience label to count, in no particular order
     */
    public static Map<String, Long> analyzeExperienceDistribution(List<Job> jobs) {
        return countByField(jobs, Job::getExperience);
    }

    /**
     * Counts postings per education requirement.
     *
     * @param jobs postings to analyze
     * @return education label to count, in no particular order
     */
    public static Map<String, Long> analyzeEducationDistribution(List<Job> jobs) {
        return countByField(jobs, Job::getEducation);
    }

    /**
     * Counts postings per raw salary string and keeps the ten most frequent values.
     *
     * @param jobs postings to analyze
     * @return salary text to count, iterating in descending count order
     */
    public static Map<String, Long> analyzeSalaryDistribution(List<Job> jobs) {
        return keepMostFrequent(countByField(jobs, Job::getSalary));
    }

    /**
     * Averages the salary midpoint per experience requirement.
     *
     * <p>Only salaries of the form {@code "<min>-<max>K"} contribute. Postings whose
     * salary cannot be parsed are left out of the average instead of being counted
     * as 0, so an experience label with no parseable salary is absent from the result.
     *
     * @param jobs postings to analyze
     * @return experience label to mean salary midpoint, in no particular order
     */
    public static Map<String, Double> analyzeSalaryByExperience(List<Job> jobs) {
        return jobs.stream()
                .filter(job -> hasText(job.getExperience()) && hasText(job.getSalary()))
                .filter(job -> parseSalaryMidpoint(job.getSalary()).isPresent())
                .collect(Collectors.groupingBy(
                        Job::getExperience,
                        Collectors.averagingDouble(job -> parseSalaryMidpoint(job.getSalary()).getAsDouble())));
    }

    /** Returns whether {@code value} is neither {@code null} nor empty. */
    private static boolean hasText(String value) {
        return value != null && !value.isEmpty();
    }

    /** Counts postings per value of {@code field}, skipping null or empty values. */
    private static Map<String, Long> countByField(List<Job> jobs,
            java.util.function.Function<Job, String> field) {
        return jobs.stream()
                .filter(job -> hasText(field.apply(job)))
                .collect(Collectors.groupingBy(field, Collectors.counting()));
    }

    /** Keeps the {@link #TOP_LIMIT} highest counts, in descending order, in an insertion-ordered map. */
    private static Map<String, Long> keepMostFrequent(Map<String, Long> counts) {
        return counts.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(TOP_LIMIT)
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new));
    }

    /**
     * Parses a salary range such as {@code "10-15K"} into its midpoint ({@code 12.5}).
     *
     * @param salary non-null salary text
     * @return the midpoint, or empty when the text is not two numbers separated by one dash
     */
    private static OptionalDouble parseSalaryMidpoint(String salary) {
        String[] bounds = salary.replace("K", "").replace("k", "").split("-");
        if (bounds.length != 2) {
            return OptionalDouble.empty();
        }
        try {
            double min = Double.parseDouble(bounds[0].trim());
            double max = Double.parseDouble(bounds[1].trim());
            return OptionalDouble.of((min + max) / 2);
        } catch (NumberFormatException ignored) {
            // Not a numeric range; the caller drops it from the average.
            return OptionalDouble.empty();
        }
    }
}

73
project/src/project/analysis/PoemAnalyzer.java

@ -0,0 +1,73 @@
package project.analysis;
import project.bean.Poem;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Static aggregation helpers over a list of crawled {@link Poem} objects.
 */
public class PoemAnalyzer {
    /** Placeholder value that marks a missing author or dynasty. */
    private static final String UNKNOWN = "Unknown";

    /** Inclusive bounds of the character range counted as a Chinese character. */
    private static final char CJK_FIRST = '\u4e00';
    private static final char CJK_LAST = '\u9fa5';

    /**
     * Common stop words excluded from the frequency count. Built once rather than on
     * every call. Counting is per single character, so only the single-character
     * entries can ever match; the longer entries are kept unchanged.
     */
    private static final Set<String> STOP_WORDS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
            "的", "了", "和", "是", "就", "都", "而", "及", "与", "着", "或",
            "一个", "没有", "我们", "你们", "他们", "它", "这", "那", "此",
            "在", "有", "不", "能", "会", "可以", "要", "应该", "可能",
            "上", "下", "前", "后", "左", "右", "中", "间", "里", "外",
            "来", "去", "过", "到", "出", "入", "进", "回", "起", "走"
    )));

    /**
     * Counts poems per dynasty, skipping poems whose dynasty is {@code null} or "Unknown".
     *
     * @param poems poems to analyze
     * @return dynasty to count, in no particular order
     */
    public static Map<String, Long> analyzeDynastyDistribution(List<Poem> poems) {
        return poems.stream()
                .filter(p -> p.getDynasty() != null && !p.getDynasty().equals(UNKNOWN))
                .collect(Collectors.groupingBy(Poem::getDynasty, Collectors.counting()));
    }

    /**
     * Counts poems per author and keeps the ten most prolific authors, skipping poems
     * whose author is {@code null} or "Unknown".
     *
     * @param poems poems to analyze
     * @return author to count, iterating in descending count order
     */
    public static Map<String, Long> analyzeAuthorTop10(List<Poem> poems) {
        Map<String, Long> perAuthor = poems.stream()
                .filter(p -> p.getAuthor() != null && !p.getAuthor().equals(UNKNOWN))
                .collect(Collectors.groupingBy(Poem::getAuthor, Collectors.counting()));
        return topEntries(perAuthor, 10);
    }

    /**
     * Counts single Chinese characters across all poem contents and returns the most
     * frequent ones. Stop words and characters outside U+4E00..U+9FA5 (punctuation,
     * line breaks, Latin text) are not counted.
     *
     * @param poems poems to analyze; poems with {@code null} content are skipped
     * @param topN  maximum number of characters to return
     * @return character to occurrence count, iterating in descending count order
     */
    public static Map<String, Long> extractHighFrequencyWords(List<Poem> poems, int topN) {
        Map<String, Long> wordCount = new HashMap<>();
        for (Poem poem : poems) {
            String content = poem.getContent();
            if (content == null) {
                continue;
            }
            // Naive tokenization: every character is its own "word".
            for (int i = 0; i < content.length(); i++) {
                char ch = content.charAt(i);
                // Plain range comparison replaces a regex match that was recompiled per character.
                if (ch < CJK_FIRST || ch > CJK_LAST) {
                    continue;
                }
                String word = String.valueOf(ch);
                if (!STOP_WORDS.contains(word)) {
                    wordCount.merge(word, 1L, Long::sum);
                }
            }
        }
        return topEntries(wordCount, topN);
    }

    /**
     * Computes the mean content length over all poems with non-null content.
     * The length is the raw {@code length()} of the content, so any line-break
     * characters it contains are included in the figure.
     *
     * @param poems poems to analyze
     * @return the mean length, or {@code 0.0} when no poem has content
     */
    public static double calculateAverageLength(List<Poem> poems) {
        return poems.stream()
                .filter(p -> p.getContent() != null)
                .mapToInt(p -> p.getContent().length())
                .average()
                .orElse(0.0);
    }

    /** Keeps the {@code limit} highest counts, in descending order, in an insertion-ordered map. */
    private static Map<String, Long> topEntries(Map<String, Long> counts, int limit) {
        return counts.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(limit)
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new));
    }
}

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save