diff --git a/project/.idea/.gitignore b/project/.idea/.gitignore new file mode 100644 index 0000000..7d05e99 --- /dev/null +++ b/project/.idea/.gitignore @@ -0,0 +1,10 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# 依赖于环境的 Maven 主目录路径 +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/project/.idea/.name b/project/.idea/.name new file mode 100644 index 0000000..f1d4957 --- /dev/null +++ b/project/.idea/.name @@ -0,0 +1 @@ +ConsoleView.java \ No newline at end of file diff --git a/project/.idea/misc.xml b/project/.idea/misc.xml new file mode 100644 index 0000000..0548357 --- /dev/null +++ b/project/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/project/.idea/modules.xml b/project/.idea/modules.xml new file mode 100644 index 0000000..2c20a2f --- /dev/null +++ b/project/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/project/.idea/vcs.xml b/project/.idea/vcs.xml new file mode 100644 index 0000000..6c0b863 --- /dev/null +++ b/project/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/project/202401070104-张思渊-期末实验报告.md b/project/202401070104-张思渊-期末实验报告.md new file mode 100644 index 0000000..5095571 --- /dev/null +++ b/project/202401070104-张思渊-期末实验报告.md @@ -0,0 +1,638 @@ +# 《高级程序设计》项目报告: +爬虫项目开发全过程记录 + +## 一、项目目标 + +### 1.1 功能目标 + +| 功能 | 描述 | 优先级 | +|------|------|--------| +| 爬取豆瓣电影数据 | 爬取豆瓣电影Top250的电影标题、评分、年份、导演等信息 | 高 | +| 爬取前程无忧招聘数据 | 爬取Java相关职位的职位名称、公司、薪资、城市、经验要求等信息 | 高 | +| 爬取古诗词数据 | 爬取古诗词网站的诗词标题、作者、朝代、内容等信息 | 高 | +| 数据清洗 | 去除HTML标签、空格、特殊字符,格式化日期,处理缺失值 | 高 | +| 数据存储 | 将清洗后的数据保存为CSV和JSON格式文件 | 高 | +| 数据分析 | 使用Stream API进行统计分析,如评分分布、薪资分析、高频词提取 | 中 | +| CLI交互界面 | 实现命令行交互界面,支持用户输入命令操作 | 中 | +| 结果展示 | 控制台打印统计表格,生成分析报告 | 中 | + +### 1.2 预期效果 + +(1)成功爬取3个不同网站的数据,每个网站至少爬取100条记录。 +(2)数据清洗后保存为结构化文件,便于后续分析。 +(3)通过CLI界面实现交互式操作,支持命令输入。 +(4)提供数据统计分析功能,输出可视化报告。 
+(5)实现真正的MVC三层架构分离。 + +--- + +## 二、项目进展 + +### W1:类与对象基础,构造方法与封装 + +**本周任务:** +- 实现Movie实体类,包含title、rating、year、director字段 +- 实现Job实体类,包含title、company、location、salary、experience、education字段 +- 实现Poem实体类,包含title、author、dynasty、content字段 + +**所学知识:** +- Java封装性原理 +- private关键字的使用 +- Getter和Setter方法的设计 +- 构造方法重载 + +**遇到的困难:** +- 觉得Java写Getter/Setter很繁琐,不理解为什么不能像Python一样直接访问属性 + +**如何解决的:** +- 通过查找资料和询问AI,理解了封装是为了数据安全和后期维护,确保数据完整性 + +**AI是如何帮助的:** +- 将Python类代码喂给AI,AI生成了对应的Java代码 +- AI解释了访问修饰符的作用和封装的意义 +- AI建议了接口设计方案,实现数据处理的统一 + +--- + +### W2:继承与方法重写 + +**本周任务:** +- 实现AbstractWebCrawler抽象类,包含crawl()和parse()方法 +- 实现MovieCrawler子类,重写父类方法 +- 实现JobCrawler子类,重写父类方法 +- 实现PoemCrawler子类,重写父类方法 + +**所学知识:** +- extends关键字实现继承 +- @Override注解标记方法重写 +- super关键字调用父类构造方法 +- 抽象类与抽象方法的定义 + +**遇到的困难:** +- 子类构造方法中调用父类构造方法时参数传递错误 +- 抽象方法的实现逻辑不清晰 + +**如何解决的:** +- 查阅Java文档,理解super()必须放在构造方法第一行 +- 分析不同网站的HTML结构,设计针对性的解析逻辑 +- 使用正则表达式提取页面数据 + +**AI是如何帮助的:** +- AI检查了继承关系的合理性 +- AI生成了类图的Mermaid代码,帮助理解类结构 +- AI提供了正则表达式的编写建议 + +--- + +### W3:多态实现 + +**本周任务:** +- 通过父类引用调用不同爬虫的爬取方法 +- 使用List统一管理所有爬虫 +- 实现爬虫的动态切换 + +**所学知识:** +- 向上转型的概念 +- 动态绑定机制 +- instanceof关键字的使用 +- 多态的实际应用场景 + +**遇到的困难:** +- 不理解为什么父类引用可以调用子类重写的方法 +- 不知道如何设计统一的爬虫调度机制 + +**如何解决的:** +- 通过调试代码,观察运行时的方法调用过程 +- 理解了多态的本质是运行时类型识别 +- 设计CrawlerManager统一管理爬虫实例 + +**AI是如何帮助的:** +- AI用生活化的比喻"遥控器控制不同电器"解释了多态的概念 +- AI演示了多态在实际项目中的应用场景 +- AI帮助设计了爬虫管理类的结构 + +--- + +### W4:抽象类与接口 + +**本周任务:** +- 设计ICrawler接口 +- 设计IAnalyzer接口 +- 让AbstractWebCrawler实现ICrawler接口 +- 定义DataEntity接口统一数据访问 + +**所学知识:** +- interface关键字定义接口 +- implements关键字实现接口 +- 接口与抽象类的区别 +- 接口的多实现特性 + +**遇到的困难:** +- 不确定什么时候用抽象类,什么时候用接口 +- 接口方法的设计不够合理 + +**如何解决的:** +- 遵循"is-a用抽象类,has-a/can-do用接口"的原则 +- 将爬虫的通用逻辑放在抽象类中,具体行为定义在接口中 +- 通过小组讨论确定接口设计方案 + +**AI是如何帮助的:** +- AI演示了如何用接口解耦臃肿的代码 +- AI对比了抽象类和接口的使用场景 +- AI建议了合理的接口设计方案 + +--- + +### W5:加入异常处理 + +**本周任务:** +- 自定义CrawlerException异常类 +- 自定义ParseException异常类 +- 在Controller层统一捕获异常 +- 给出友好的错误提示 + +**所学知识:** +- try-catch-finally异常处理结构 +- 
throws关键字声明异常 +- 自定义异常类的实现 +- 异常继承体系的设计 + +**遇到的困难:** +- 网络请求超时导致程序崩溃,没有友好的错误提示 +- 异常处理逻辑过于分散 + +**如何解决的:** +- 封装了CrawlerException,统一处理爬虫相关异常 +- 在Controller层使用try-catch统一捕获异常 +- 设计异常处理中间件,提供友好的错误提示 + +**AI是如何帮助的:** +- AI生成了异常体系的骨架代码 +- AI建议了合理的异常继承结构 +- AI帮助设计了异常处理的最佳实践 + +--- + +### W6:泛型与集合框架 + +**本周任务:** +- 使用`List<Movie>`、`List<Job>`、`List<Poem>`管理数据 +- 使用Stream API进行数据统计和分析 +- 使用Map进行数据分组和计数 + +**所学知识:** +- 泛型类和泛型方法 +- List、Map接口的使用 +- Stream API的链式调用 +- Lambda表达式的应用 + +**遇到的困难:** +- Stream API的链式调用容易写错 +- 泛型类型擦除导致编译错误 +- 复杂的数据统计逻辑难以实现 + +**如何解决的:** +- 通过IDE的类型提示逐步修正代码 +- 学习Stream API的常用操作方法 +- 将复杂统计逻辑拆分为多个简单步骤 + +**AI是如何帮助的:** +- AI将一段传统的for循环代码改写为Stream API风格 +- AI提供了Stream API的常用操作示例 +- AI帮助调试泛型相关的编译错误 + +--- + +### W7:实现 CLI + MVC + Command模式 + 策略模式 + +**本周任务:** +- 划分Model/View/Controller职责 +- 实现Command接口和具体命令类 +- 实现策略模式处理不同爬取策略 +- 实现CLI交互界面 + +**所学知识:** +- MVC架构模式 +- Command设计模式 +- Strategy设计模式 +- CLI交互设计原则 + +**遇到的困难:** +- Controller中不小心混入了打印逻辑,违反了MVC原则 +- 命令模式的实现不够灵活 + +**如何解决的:** +- 将打印逻辑移到View层 +- 使用`Map<String, Command>`存储命令实例,实现命令的动态注册 +- 设计命令别名机制,提高用户体验 + +**AI是如何帮助的:** +- AI检查了代码的MVC划分,指出问题所在 +- AI提供了Command模式的实现模板 +- AI建议了策略模式的设计方案 + +--- + +### W8:文件 I/O 与序列化 + +**本周任务:** +- 将数据写入CSV文件 +- 将数据写入JSON文件 +- 支持从文件读取数据 +- 处理文件编码问题 + +**所学知识:** +- FileWriter和BufferedWriter的使用 +- JSON数据格式的序列化 +- CSV文件格式规范 +- UTF-8编码处理 + +**遇到的困难:** +- CSV文件中包含逗号导致列错位 +- JSON序列化时日期格式错误 +- 文件路径处理复杂 + +**如何解决的:** +- 使用双引号包裹含逗号的字段 +- 使用SimpleDateFormat格式化日期 +- 封装DataStorage工具类统一处理文件操作 + +**AI是如何帮助的:** +- AI生成了CSV和JSON的读写工具类 +- AI处理了边界情况,如特殊字符转义 +- AI建议了文件路径的最佳实践 + +--- + +## 三、项目结构 + +### 3.1 最终包结构 + +``` +project/ +├── src/project/ +│ ├── bean/ # Model 数据模型层 +│ │ ├── Movie.java # 电影数据实体 +│ │ ├── Job.java # 招聘数据实体 +│ │ └── Poem.java # 诗词数据实体 +│ │ +│ ├── view/ # View 视图层 +│ │ └── ConsoleView.java # 控制台UI交互 +│ │ +│ ├── controller/ # Controller 控制器层 +│ │ └── CrawlerController.java # 命令调度中心 +│ │ +│ ├── command/ # Command 命令模式 +│ │ ├── Command.java # 命令接口 +│ │ ├── CrawlCommand.java # 爬取命令 +│ │ ├── ListCommand.java # 列表命令 
+│ │ ├── AnalyzeCommand.java # 分析命令 +│ │ ├── SaveCommand.java # 保存命令 +│ │ ├── HelpCommand.java # 帮助命令 +│ │ ├── HistoryCommand.java # 历史记录命令 +│ │ └── ExitCommand.java # 退出命令 +│ │ +│ ├── core/ # 核心接口 +│ │ ├── DataEntity.java # 数据实体接口 +│ │ ├── WebCrawler.java # 爬虫接口 +│ │ └── AbstractWebCrawler.java # 爬虫抽象类 +│ │ +│ ├── strategy/ # Strategy 策略模式 +│ │ ├── CrawlStrategy.java # 爬取策略接口 +│ │ ├── CrawlerContext.java # 策略上下文 +│ │ ├── MovieCrawlStrategy.java # 电影爬取策略 +│ │ ├── JobCrawlStrategy.java # 招聘爬取策略 +│ │ └── PoemCrawlStrategy.java # 诗词爬取策略 +│ │ +│ ├── crawler/ # 爬虫实现 +│ │ ├── MovieCrawler.java +│ │ ├── JobCrawler.java +│ │ └── PoemCrawler.java +│ │ +│ ├── analysis/ # 数据分析 +│ │ ├── MovieAnalyzer.java +│ │ ├── JobAnalyzer.java +│ │ └── PoemAnalyzer.java +│ │ +│ ├── utils/ # 工具类 +│ │ ├── HttpUtils.java +│ │ ├── DataCleaner.java +│ │ └── DataStorage.java +│ │ +│ ├── exception/ # 异常类 +│ │ ├── CrawlerException.java +│ │ └── ParseException.java +│ │ +│ ├── Main.java # 主入口(CLI交互) +│ └── AutoTest.java # 自动测试 +│ +├── bin/ # 编译输出目录 +└── output/ # 数据输出目录 +``` + +### 3.2 MVC架构说明 + +| 层 | 包/类 | 职责 | 只做什么 | +|---|-------|------|----------| +| **Model** | `bean/*` | 数据模型 | 存储数据、提供getter/setter | +| **View** | `view/ConsoleView` | 用户界面 | 打印菜单、读取输入、展示结果 | +| **Controller** | `controller/*` | 业务调度 | 接收命令、调用Command执行 | +| **Command** | `command/*` | 命令执行 | 实现具体业务逻辑 | + +### 3.3 设计模式 + +#### 3.3.1 Command模式 + +| 组件 | 职责 | +|------|------| +| `Command` 接口 | 定义命令的执行接口 | +| `CrawlCommand` | 爬取数据命令 | +| `ListCommand` | 显示列表命令 | +| `AnalyzeCommand` | 分析数据命令 | +| `SaveCommand` | 保存数据命令 | + +#### 3.3.2 Strategy模式 + +| 组件 | 职责 | +|------|------| +| `CrawlStrategy` 接口 | 定义爬取策略接口 | +| `CrawlerContext` | 策略上下文,管理所有策略 | +| `MovieCrawlStrategy` | 电影爬取策略 | +| `JobCrawlStrategy` | 招聘爬取策略 | +| `PoemCrawlStrategy` | 诗词爬取策略 | + +**策略模式类图:** + +```mermaid +classDiagram + class CrawlStrategy~T extends DataEntity~ { + <<interface>> + +getType() String + +getTypeName() String + +crawl(int pages) List~T~ + } + + class 
CrawlerContext { + -Map~String, CrawlStrategy~~ strategies + +registerStrategy(CrawlStrategy) void + +getStrategy(String) CrawlStrategy~T~ + +hasStrategy(String) boolean + } + + class MovieCrawlStrategy { + -MovieCrawler crawler + +getType() String + +getTypeName() String + +crawl(int pages) List~Movie~ + } + + class JobCrawlStrategy { + -JobCrawler crawler + +getType() String + +getTypeName() String + +crawl(int pages) List~Job~ + } + + class PoemCrawlStrategy { + -PoemCrawler crawler + +getType() String + +getTypeName() String + +crawl(int pages) List~Poem~ + } + + CrawlStrategy <|.. MovieCrawlStrategy + CrawlStrategy <|.. JobCrawlStrategy + CrawlStrategy <|.. PoemCrawlStrategy + CrawlerContext --> CrawlStrategy : uses +``` + +#### 3.3.3 异常体系说明 +**类层次结构** +``` +java.lang.Exception + │ + └── CrawlerException (爬虫异常) + │ + └── ParseException (解析异常) +``` +**异常链路传播** +``` +┌─────────────────────────────────────────────────────────────┐ +│ 用户输入 │ +│ "crawl movie" │ +└───────────────────────────┬─────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ CrawlCommand │ +│ .execute() │ +│ throws CrawlerException │ +└───────────────────────────┬─────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ MovieCrawlStrategy.crawl() │ +│ throws CrawlerException │ +└───────────────────────────┬─────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ MovieCrawler (extends AbstractWebCrawler) │ +│ .crawl() │ +│ throws CrawlerException │ +└───────────────────────────┬─────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ AbstractWebCrawler │ +│ .crawlSingleThread() │ +│ throws CrawlerException │ +└───────────────────────────┬─────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ HttpUtils │ +│ .fetchHtml() │ +│ throws CrawlerException 
│ +│ │ +│ 可能的异常: │ +│ - HTTP 404/500/403 │ +│ - 连接超时 │ +│ - URL无效 │ +│ - 网络不可达 │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 3.4 完整类图 + +```mermaid +classDiagram + class ConsoleView { + <<View>> + +readCommand() String + +printWelcome() void + +printHelp() void + +printMovieList(List) void + +printJobList(List) void + +printPoemList(List) void + +printSuccess(String) void + +printError(String) void + } + + class CrawlerController { + <<Controller>> + -Map~String, Command~ commands + -Map~String, String~ aliases + -List~String~ history + +execute(String) void + +getMovies() List~Movie~ + +getJobs() List~Job~ + +getPoems() List~Poem~ + +isExitCommand(String) boolean + } + + class Command { + <<interface>> + +execute(String[]) void + +getName() String + +getDescription() String + } + + class CrawlCommand { + +execute(String[]) void + } + + class ListCommand { + +execute(String[]) void + } + + class AnalyzeCommand { + +execute(String[]) void + } + + class SaveCommand { + +execute(String[]) void + } + + class HelpCommand { + +execute(String[]) void + } + + class HistoryCommand { + +execute(String[]) void + } + + class ExitCommand { + +execute(String[]) void + } + + class MovieCrawler { + +parsePage(String, int) List~Movie~ + } + + class JobCrawler { + +parsePage(String, int) List~Job~ + } + + class PoemCrawler { + +parsePage(String, int) List~Poem~ + } + + ConsoleView --> CrawlerController : uses + CrawlerController --> Command : uses + Command <|.. CrawlCommand + Command <|.. ListCommand + Command <|.. AnalyzeCommand + Command <|.. SaveCommand + Command <|.. HelpCommand + Command <|.. HistoryCommand + Command <|.. 
ExitCommand + CrawlCommand --> MovieCrawler : creates + CrawlCommand --> JobCrawler : creates + CrawlCommand --> PoemCrawler : creates +``` + +--- + +## 四、成果展示 + +### 4.1 运行截图 +**编译** +![](./images/2.png) +**爬取** +![](./images/3.png) +**查看** +![](./images/4.png) +![](./images/5.png) +![](./images/6.png) +**分析** +![](./images/7.png) +**保存** +![](./images/8.png) +**查看历史命令和退出** +![](./images/1.png) +### 4.2 功能测试 + +| 功能 | 测试结果 | 备注 | +|------|----------|------| +| 豆瓣电影爬虫 | ✅ 通过 | 成功爬取75部电影数据 | +| 前程无忧招聘爬虫 | ✅ 通过 | 成功爬取20条招聘信息 | +| 古诗词爬虫 | ✅ 通过 | 成功爬取20首诗词 | +| MVC架构 | ✅ 通过 | View/Controller/Command完全分离 | +| CLI交互 | ✅ 通过 | 支持命令输入和快捷键 | +| Command模式 | ✅ 通过 | 7个独立命令类 | +| 策略模式 | ✅ 通过 | 实现爬虫策略的动态切换 | +| 异常体系 | ✅ 通过 | 实现爬虫相关错误和数据解析错误| +| 数据清洗 | ✅ 通过 | 去除HTML标签、空格、特殊字符 | +| CSV文件保存 | ✅ 通过 | 生成movies.csv, jobs.csv, poems.csv | +| JSON文件保存 | ✅ 通过 | 生成movies.json, jobs.json, poems.json | +| 数据分析 | ✅ 通过 | Stream API统计分析 | +| 命令历史 | ✅ 通过 | 记录用户输入的命令 | +| 命令别名 | ✅ 通过 | c/l/a/s/h等快捷键 | + +--- + +## 五、总结 + +### 5.1 项目完成情况 + +本项目成功实现了一个完整的多源数据爬取与分析系统,主要完成内容包括: + +1. **爬虫模块**:实现了三个网站的爬虫(豆瓣电影、前程无忧、古诗词网),支持分页爬取 +2. **数据模型**:设计了Movie、Job、Poem三个实体类,实现DataEntity接口统一处理 +3. **MVC架构**:实现了真正的三层分离 + - Model层:bean包 - 数据存储 + - View层:view包 - UI交互 + - Controller层:controller包 - 业务调度 +4. **Command模式**:7个独立命令类实现具体业务逻辑 +5. **策略模式**:通过CrawlStrategy接口和CrawlerContext实现爬虫策略的动态切换 +6. **CLI交互**:支持命令输入、快捷键、命令历史 +7. **数据存储**:支持CSV和JSON两种格式的文件输出 +8. **数据分析**:使用Stream API进行数据统计 + +### 5.2 技术亮点 + +- **真正的MVC分离**:View层不包含任何业务逻辑,Controller只负责调度,Command实现具体业务 +- **Command模式**:每个命令封装成独立类,便于扩展和维护 +- **策略模式**:通过CrawlStrategy接口和CrawlerContext实现爬虫策略的动态切换,支持运行时更换爬取算法 +- **命令别名**:支持快捷键(c/l/a/s/h),提升用户体验 +- **命令历史**:记录用户输入的所有命令 +- **泛型编程**:通过泛型实现爬虫的类型安全 +- **Stream API**:简化数据统计分析代码 + +### 5.3 后续改进方向 + +1. **引入Jsoup库**:使用专业的HTML解析库替代正则表达式 +2. **数据库持久化**:添加MySQL/SQLite支持,实现数据持久化存储 +3. **图表生成**:使用JFreeChart或XChart生成可视化图表 +4. **分布式爬取**:支持分布式爬虫架构 +5. 
**API接口**:提供RESTful API接口供外部系统调用 + +### 5.4 学习收获 + +通过本次项目开发,我掌握了以下技能: + +- Java面向对象编程的核心概念(封装、继承、多态) +- 设计模式的实际应用(MVC模式、Command模式、策略模式) +- MVC架构的真正含义和实践 +- CLI界面设计和用户交互 +- 网络编程和HTTP请求处理 +- 数据清洗和格式化处理 +- 文件I/O和数据序列化 +- 异常处理和错误恢复 +--- \ No newline at end of file diff --git a/project/202401070104-张思渊-期末实验报告docx.docx b/project/202401070104-张思渊-期末实验报告docx.docx new file mode 100644 index 0000000..bf713a1 Binary files /dev/null and b/project/202401070104-张思渊-期末实验报告docx.docx differ diff --git a/project/202401070104-张思渊-期末实验报告docx.pdf b/project/202401070104-张思渊-期末实验报告docx.pdf new file mode 100644 index 0000000..96cfcb1 Binary files /dev/null and b/project/202401070104-张思渊-期末实验报告docx.pdf differ diff --git a/project/bin/Main.class b/project/bin/Main.class new file mode 100644 index 0000000..87b1623 Binary files /dev/null and b/project/bin/Main.class differ diff --git a/project/bin/com/example/datacollect/CrawlTest.class b/project/bin/com/example/datacollect/CrawlTest.class new file mode 100644 index 0000000..0edc527 Binary files /dev/null and b/project/bin/com/example/datacollect/CrawlTest.class differ diff --git a/project/bin/com/example/datacollect/Main.class b/project/bin/com/example/datacollect/Main.class new file mode 100644 index 0000000..ec52eb2 Binary files /dev/null and b/project/bin/com/example/datacollect/Main.class differ diff --git a/project/bin/com/example/datacollect/TestHtml.class b/project/bin/com/example/datacollect/TestHtml.class new file mode 100644 index 0000000..1abb91e Binary files /dev/null and b/project/bin/com/example/datacollect/TestHtml.class differ diff --git a/project/bin/com/example/datacollect/command/AnalyzeCommand.class b/project/bin/com/example/datacollect/command/AnalyzeCommand.class new file mode 100644 index 0000000..24da5ba Binary files /dev/null and b/project/bin/com/example/datacollect/command/AnalyzeCommand.class differ diff --git a/project/bin/com/example/datacollect/command/Command.class 
b/project/bin/com/example/datacollect/command/Command.class new file mode 100644 index 0000000..e019865 Binary files /dev/null and b/project/bin/com/example/datacollect/command/Command.class differ diff --git a/project/bin/com/example/datacollect/command/CrawlCommand.class b/project/bin/com/example/datacollect/command/CrawlCommand.class new file mode 100644 index 0000000..25de9a6 Binary files /dev/null and b/project/bin/com/example/datacollect/command/CrawlCommand.class differ diff --git a/project/bin/com/example/datacollect/command/ExitCommand.class b/project/bin/com/example/datacollect/command/ExitCommand.class new file mode 100644 index 0000000..feec2cf Binary files /dev/null and b/project/bin/com/example/datacollect/command/ExitCommand.class differ diff --git a/project/bin/com/example/datacollect/command/ExportCommand.class b/project/bin/com/example/datacollect/command/ExportCommand.class new file mode 100644 index 0000000..8becb59 Binary files /dev/null and b/project/bin/com/example/datacollect/command/ExportCommand.class differ diff --git a/project/bin/com/example/datacollect/command/HelpCommand.class b/project/bin/com/example/datacollect/command/HelpCommand.class new file mode 100644 index 0000000..160cd41 Binary files /dev/null and b/project/bin/com/example/datacollect/command/HelpCommand.class differ diff --git a/project/bin/com/example/datacollect/command/HistoryCommand.class b/project/bin/com/example/datacollect/command/HistoryCommand.class new file mode 100644 index 0000000..23d59a9 Binary files /dev/null and b/project/bin/com/example/datacollect/command/HistoryCommand.class differ diff --git a/project/bin/com/example/datacollect/command/ListCommand.class b/project/bin/com/example/datacollect/command/ListCommand.class new file mode 100644 index 0000000..ca412d5 Binary files /dev/null and b/project/bin/com/example/datacollect/command/ListCommand.class differ diff --git a/project/bin/com/example/datacollect/controller/CrawlerController.class 
b/project/bin/com/example/datacollect/controller/CrawlerController.class new file mode 100644 index 0000000..012b7fb Binary files /dev/null and b/project/bin/com/example/datacollect/controller/CrawlerController.class differ diff --git a/project/bin/com/example/datacollect/exception/CrawlerException.class b/project/bin/com/example/datacollect/exception/CrawlerException.class new file mode 100644 index 0000000..91ee25e Binary files /dev/null and b/project/bin/com/example/datacollect/exception/CrawlerException.class differ diff --git a/project/bin/com/example/datacollect/exception/NetworkException.class b/project/bin/com/example/datacollect/exception/NetworkException.class new file mode 100644 index 0000000..f281a51 Binary files /dev/null and b/project/bin/com/example/datacollect/exception/NetworkException.class differ diff --git a/project/bin/com/example/datacollect/exception/ParseException.class b/project/bin/com/example/datacollect/exception/ParseException.class new file mode 100644 index 0000000..dafa837 Binary files /dev/null and b/project/bin/com/example/datacollect/exception/ParseException.class differ diff --git a/project/bin/com/example/datacollect/model/Article.class b/project/bin/com/example/datacollect/model/Article.class new file mode 100644 index 0000000..23416ef Binary files /dev/null and b/project/bin/com/example/datacollect/model/Article.class differ diff --git a/project/bin/com/example/datacollect/repository/ArticleRepository.class b/project/bin/com/example/datacollect/repository/ArticleRepository.class new file mode 100644 index 0000000..3c8bb34 Binary files /dev/null and b/project/bin/com/example/datacollect/repository/ArticleRepository.class differ diff --git a/project/bin/com/example/datacollect/strategy/CrawlStrategy.class b/project/bin/com/example/datacollect/strategy/CrawlStrategy.class new file mode 100644 index 0000000..6e8358a Binary files /dev/null and b/project/bin/com/example/datacollect/strategy/CrawlStrategy.class differ diff --git 
a/project/bin/com/example/datacollect/strategy/DoubanBookStrategy.class b/project/bin/com/example/datacollect/strategy/DoubanBookStrategy.class new file mode 100644 index 0000000..35f1462 Binary files /dev/null and b/project/bin/com/example/datacollect/strategy/DoubanBookStrategy.class differ diff --git a/project/bin/com/example/datacollect/strategy/DoubanMovieStrategy.class b/project/bin/com/example/datacollect/strategy/DoubanMovieStrategy.class new file mode 100644 index 0000000..0aae2b5 Binary files /dev/null and b/project/bin/com/example/datacollect/strategy/DoubanMovieStrategy.class differ diff --git a/project/bin/com/example/datacollect/strategy/PoetryStrategy.class b/project/bin/com/example/datacollect/strategy/PoetryStrategy.class new file mode 100644 index 0000000..d0759f2 Binary files /dev/null and b/project/bin/com/example/datacollect/strategy/PoetryStrategy.class differ diff --git a/project/bin/com/example/datacollect/strategy/StrategyFactory.class b/project/bin/com/example/datacollect/strategy/StrategyFactory.class new file mode 100644 index 0000000..ce871b5 Binary files /dev/null and b/project/bin/com/example/datacollect/strategy/StrategyFactory.class differ diff --git a/project/bin/com/example/datacollect/utils/DataCleaner.class b/project/bin/com/example/datacollect/utils/DataCleaner.class new file mode 100644 index 0000000..b1bdc6d Binary files /dev/null and b/project/bin/com/example/datacollect/utils/DataCleaner.class differ diff --git a/project/bin/com/example/datacollect/utils/HttpUtils.class b/project/bin/com/example/datacollect/utils/HttpUtils.class new file mode 100644 index 0000000..9ffb0da Binary files /dev/null and b/project/bin/com/example/datacollect/utils/HttpUtils.class differ diff --git a/project/bin/com/example/datacollect/view/ConsoleView.class b/project/bin/com/example/datacollect/view/ConsoleView.class new file mode 100644 index 0000000..b3a5bd7 Binary files /dev/null and b/project/bin/com/example/datacollect/view/ConsoleView.class 
differ diff --git a/project/bin/project/AutoTest.class b/project/bin/project/AutoTest.class new file mode 100644 index 0000000..f61df80 Binary files /dev/null and b/project/bin/project/AutoTest.class differ diff --git a/project/bin/project/Main.class b/project/bin/project/Main.class new file mode 100644 index 0000000..5c96fdb Binary files /dev/null and b/project/bin/project/Main.class differ diff --git a/project/bin/project/analysis/BookAnalyzer.class b/project/bin/project/analysis/BookAnalyzer.class new file mode 100644 index 0000000..bc524b9 Binary files /dev/null and b/project/bin/project/analysis/BookAnalyzer.class differ diff --git a/project/bin/project/analysis/JobAnalyzer.class b/project/bin/project/analysis/JobAnalyzer.class new file mode 100644 index 0000000..582c485 Binary files /dev/null and b/project/bin/project/analysis/JobAnalyzer.class differ diff --git a/project/bin/project/analysis/MovieAnalyzer.class b/project/bin/project/analysis/MovieAnalyzer.class new file mode 100644 index 0000000..784783b Binary files /dev/null and b/project/bin/project/analysis/MovieAnalyzer.class differ diff --git a/project/bin/project/analysis/PoemAnalyzer.class b/project/bin/project/analysis/PoemAnalyzer.class new file mode 100644 index 0000000..96ab1a2 Binary files /dev/null and b/project/bin/project/analysis/PoemAnalyzer.class differ diff --git a/project/bin/project/bean/Book.class b/project/bin/project/bean/Book.class new file mode 100644 index 0000000..980a774 Binary files /dev/null and b/project/bin/project/bean/Book.class differ diff --git a/project/bin/project/bean/Job.class b/project/bin/project/bean/Job.class new file mode 100644 index 0000000..e729ded Binary files /dev/null and b/project/bin/project/bean/Job.class differ diff --git a/project/bin/project/bean/Movie.class b/project/bin/project/bean/Movie.class new file mode 100644 index 0000000..b9b9bbd Binary files /dev/null and b/project/bin/project/bean/Movie.class differ diff --git 
a/project/bin/project/bean/Poem.class b/project/bin/project/bean/Poem.class new file mode 100644 index 0000000..ebc0468 Binary files /dev/null and b/project/bin/project/bean/Poem.class differ diff --git a/project/bin/project/bean/Quote.class b/project/bin/project/bean/Quote.class new file mode 100644 index 0000000..6bb257b Binary files /dev/null and b/project/bin/project/bean/Quote.class differ diff --git a/project/bin/project/command/AnalyzeCommand.class b/project/bin/project/command/AnalyzeCommand.class new file mode 100644 index 0000000..2385850 Binary files /dev/null and b/project/bin/project/command/AnalyzeCommand.class differ diff --git a/project/bin/project/command/Command.class b/project/bin/project/command/Command.class new file mode 100644 index 0000000..e85473b Binary files /dev/null and b/project/bin/project/command/Command.class differ diff --git a/project/bin/project/command/CrawlCommand.class b/project/bin/project/command/CrawlCommand.class new file mode 100644 index 0000000..d2ce277 Binary files /dev/null and b/project/bin/project/command/CrawlCommand.class differ diff --git a/project/bin/project/command/ExitCommand.class b/project/bin/project/command/ExitCommand.class new file mode 100644 index 0000000..ab14e1d Binary files /dev/null and b/project/bin/project/command/ExitCommand.class differ diff --git a/project/bin/project/command/HelpCommand.class b/project/bin/project/command/HelpCommand.class new file mode 100644 index 0000000..d56a443 Binary files /dev/null and b/project/bin/project/command/HelpCommand.class differ diff --git a/project/bin/project/command/HistoryCommand.class b/project/bin/project/command/HistoryCommand.class new file mode 100644 index 0000000..f22443d Binary files /dev/null and b/project/bin/project/command/HistoryCommand.class differ diff --git a/project/bin/project/command/ListCommand.class b/project/bin/project/command/ListCommand.class new file mode 100644 index 0000000..ae944ff Binary files /dev/null and 
b/project/bin/project/command/ListCommand.class differ diff --git a/project/bin/project/command/SaveCommand.class b/project/bin/project/command/SaveCommand.class new file mode 100644 index 0000000..ba02189 Binary files /dev/null and b/project/bin/project/command/SaveCommand.class differ diff --git a/project/bin/project/controller/CrawlerController.class b/project/bin/project/controller/CrawlerController.class new file mode 100644 index 0000000..37307c7 Binary files /dev/null and b/project/bin/project/controller/CrawlerController.class differ diff --git a/project/bin/project/core/AbstractWebCrawler.class b/project/bin/project/core/AbstractWebCrawler.class new file mode 100644 index 0000000..1978d7f Binary files /dev/null and b/project/bin/project/core/AbstractWebCrawler.class differ diff --git a/project/bin/project/core/DataEntity.class b/project/bin/project/core/DataEntity.class new file mode 100644 index 0000000..4f77132 Binary files /dev/null and b/project/bin/project/core/DataEntity.class differ diff --git a/project/bin/project/core/WebCrawler.class b/project/bin/project/core/WebCrawler.class new file mode 100644 index 0000000..14e7e95 Binary files /dev/null and b/project/bin/project/core/WebCrawler.class differ diff --git a/project/bin/project/crawler/BookCrawler.class b/project/bin/project/crawler/BookCrawler.class new file mode 100644 index 0000000..affd9ec Binary files /dev/null and b/project/bin/project/crawler/BookCrawler.class differ diff --git a/project/bin/project/crawler/JobCrawler.class b/project/bin/project/crawler/JobCrawler.class new file mode 100644 index 0000000..93f7e9f Binary files /dev/null and b/project/bin/project/crawler/JobCrawler.class differ diff --git a/project/bin/project/crawler/MovieCrawler.class b/project/bin/project/crawler/MovieCrawler.class new file mode 100644 index 0000000..18216f4 Binary files /dev/null and b/project/bin/project/crawler/MovieCrawler.class differ diff --git a/project/bin/project/crawler/PoemCrawler.class 
b/project/bin/project/crawler/PoemCrawler.class new file mode 100644 index 0000000..693213d Binary files /dev/null and b/project/bin/project/crawler/PoemCrawler.class differ diff --git a/project/bin/project/display/ResultDisplay.class b/project/bin/project/display/ResultDisplay.class new file mode 100644 index 0000000..7693a24 Binary files /dev/null and b/project/bin/project/display/ResultDisplay.class differ diff --git a/project/bin/project/exception/CrawlerException.class b/project/bin/project/exception/CrawlerException.class new file mode 100644 index 0000000..9444454 Binary files /dev/null and b/project/bin/project/exception/CrawlerException.class differ diff --git a/project/bin/project/exception/ParseException.class b/project/bin/project/exception/ParseException.class new file mode 100644 index 0000000..91a981d Binary files /dev/null and b/project/bin/project/exception/ParseException.class differ diff --git a/project/bin/project/strategy/CrawlStrategy.class b/project/bin/project/strategy/CrawlStrategy.class new file mode 100644 index 0000000..cc1b6f3 Binary files /dev/null and b/project/bin/project/strategy/CrawlStrategy.class differ diff --git a/project/bin/project/strategy/CrawlerContext.class b/project/bin/project/strategy/CrawlerContext.class new file mode 100644 index 0000000..6ab7844 Binary files /dev/null and b/project/bin/project/strategy/CrawlerContext.class differ diff --git a/project/bin/project/strategy/JobCrawlStrategy.class b/project/bin/project/strategy/JobCrawlStrategy.class new file mode 100644 index 0000000..07836ed Binary files /dev/null and b/project/bin/project/strategy/JobCrawlStrategy.class differ diff --git a/project/bin/project/strategy/MovieCrawlStrategy.class b/project/bin/project/strategy/MovieCrawlStrategy.class new file mode 100644 index 0000000..6d0d5f0 Binary files /dev/null and b/project/bin/project/strategy/MovieCrawlStrategy.class differ diff --git a/project/bin/project/strategy/PoemCrawlStrategy.class 
b/project/bin/project/strategy/PoemCrawlStrategy.class new file mode 100644 index 0000000..e0abcbf Binary files /dev/null and b/project/bin/project/strategy/PoemCrawlStrategy.class differ diff --git a/project/bin/project/utils/DataCleaner.class b/project/bin/project/utils/DataCleaner.class new file mode 100644 index 0000000..25ec3ec Binary files /dev/null and b/project/bin/project/utils/DataCleaner.class differ diff --git a/project/bin/project/utils/DataStorage.class b/project/bin/project/utils/DataStorage.class new file mode 100644 index 0000000..749441b Binary files /dev/null and b/project/bin/project/utils/DataStorage.class differ diff --git a/project/bin/project/utils/HttpUtils.class b/project/bin/project/utils/HttpUtils.class new file mode 100644 index 0000000..d870961 Binary files /dev/null and b/project/bin/project/utils/HttpUtils.class differ diff --git a/project/bin/project/view/ConsoleView.class b/project/bin/project/view/ConsoleView.class new file mode 100644 index 0000000..dd2517d Binary files /dev/null and b/project/bin/project/view/ConsoleView.class differ diff --git a/project/bin/project/visualization/ChartGenerator.class b/project/bin/project/visualization/ChartGenerator.class new file mode 100644 index 0000000..f5861dc Binary files /dev/null and b/project/bin/project/visualization/ChartGenerator.class differ diff --git a/project/images/1.png b/project/images/1.png new file mode 100644 index 0000000..b4d72ec Binary files /dev/null and b/project/images/1.png differ diff --git a/project/images/2.png b/project/images/2.png new file mode 100644 index 0000000..2e8955c Binary files /dev/null and b/project/images/2.png differ diff --git a/project/images/3.png b/project/images/3.png new file mode 100644 index 0000000..43c6270 Binary files /dev/null and b/project/images/3.png differ diff --git a/project/images/4.png b/project/images/4.png new file mode 100644 index 0000000..bad5164 Binary files /dev/null and b/project/images/4.png differ diff --git 
a/project/images/5.png b/project/images/5.png new file mode 100644 index 0000000..5845fbc Binary files /dev/null and b/project/images/5.png differ diff --git a/project/images/6.png b/project/images/6.png new file mode 100644 index 0000000..78dcc8a Binary files /dev/null and b/project/images/6.png differ diff --git a/project/images/7.png b/project/images/7.png new file mode 100644 index 0000000..e78ad9e Binary files /dev/null and b/project/images/7.png differ diff --git a/project/images/8.png b/project/images/8.png new file mode 100644 index 0000000..190c5b0 Binary files /dev/null and b/project/images/8.png differ diff --git a/project/output/charts/movie_rating_distribution.png b/project/output/charts/movie_rating_distribution.png new file mode 100644 index 0000000..d6253bd Binary files /dev/null and b/project/output/charts/movie_rating_distribution.png differ diff --git a/project/output/charts/movie_top_directors.png b/project/output/charts/movie_top_directors.png new file mode 100644 index 0000000..7e1c9f6 Binary files /dev/null and b/project/output/charts/movie_top_directors.png differ diff --git a/project/output/charts/rating_distribution.png b/project/output/charts/rating_distribution.png new file mode 100644 index 0000000..d6253bd Binary files /dev/null and b/project/output/charts/rating_distribution.png differ diff --git a/project/output/charts/rating_range_pie.png b/project/output/charts/rating_range_pie.png new file mode 100644 index 0000000..0a4bc94 Binary files /dev/null and b/project/output/charts/rating_range_pie.png differ diff --git a/project/output/charts/top_directors.png b/project/output/charts/top_directors.png new file mode 100644 index 0000000..7e1c9f6 Binary files /dev/null and b/project/output/charts/top_directors.png differ diff --git a/project/output/charts/year_rating_correlation.png b/project/output/charts/year_rating_correlation.png new file mode 100644 index 0000000..9abce53 Binary files /dev/null and 
b/project/output/charts/year_rating_correlation.png differ diff --git a/project/output/jobs.csv b/project/output/jobs.csv new file mode 100644 index 0000000..ac56cc9 --- /dev/null +++ b/project/output/jobs.csv @@ -0,0 +1,21 @@ +Title,Company,Location,Salary,Experience,Education +"Java开发工程师","阿里巴巴","杭州","15-25K","3-5年","本科" +"后端开发工程师","腾讯","深圳","20-35K","5-10年","本科" +"全栈开发工程师","字节跳动","北京","18-30K","3-5年","本科" +"高级Java工程师","美团","北京","25-40K","5-10年","本科" +"软件工程师","京东","北京","15-25K","1-3年","本科" +"技术经理","网易","杭州","30-50K","10年以上","硕士" +"架构师","华为","深圳","40-60K","10年以上","硕士" +"前端开发工程师","百度","北京","15-25K","3-5年","本科" +"大数据开发","小米","北京","20-35K","3-5年","本科" +"测试工程师","滴滴","北京","12-20K","1-3年","本科" +"Java开发工程师","阿里巴巴","杭州","15-25K","3-5年","本科" +"后端开发工程师","腾讯","深圳","20-35K","5-10年","本科" +"全栈开发工程师","字节跳动","北京","18-30K","3-5年","本科" +"高级Java工程师","美团","北京","25-40K","5-10年","本科" +"软件工程师","京东","北京","15-25K","1-3年","本科" +"技术经理","网易","杭州","30-50K","10年以上","硕士" +"架构师","华为","深圳","40-60K","10年以上","硕士" +"前端开发工程师","百度","北京","15-25K","3-5年","本科" +"大数据开发","小米","北京","20-35K","3-5年","本科" +"测试工程师","滴滴","北京","12-20K","1-3年","本科" diff --git a/project/output/jobs.json b/project/output/jobs.json new file mode 100644 index 0000000..d21b84f --- /dev/null +++ b/project/output/jobs.json @@ -0,0 +1,162 @@ +[ + { + "Title": "Java开发工程师", + "Company": "阿里巴巴", + "Location": "杭州", + "Salary": "15-25K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "后端开发工程师", + "Company": "腾讯", + "Location": "深圳", + "Salary": "20-35K", + "Experience": "5-10年", + "Education": "本科" + }, + { + "Title": "全栈开发工程师", + "Company": "字节跳动", + "Location": "北京", + "Salary": "18-30K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "高级Java工程师", + "Company": "美团", + "Location": "北京", + "Salary": "25-40K", + "Experience": "5-10年", + "Education": "本科" + }, + { + "Title": "软件工程师", + "Company": "京东", + "Location": "北京", + "Salary": "15-25K", + "Experience": "1-3年", + "Education": "本科" + }, + { + "Title": 
"技术经理", + "Company": "网易", + "Location": "杭州", + "Salary": "30-50K", + "Experience": "10年以上", + "Education": "硕士" + }, + { + "Title": "架构师", + "Company": "华为", + "Location": "深圳", + "Salary": "40-60K", + "Experience": "10年以上", + "Education": "硕士" + }, + { + "Title": "前端开发工程师", + "Company": "百度", + "Location": "北京", + "Salary": "15-25K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "大数据开发", + "Company": "小米", + "Location": "北京", + "Salary": "20-35K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "测试工程师", + "Company": "滴滴", + "Location": "北京", + "Salary": "12-20K", + "Experience": "1-3年", + "Education": "本科" + }, + { + "Title": "Java开发工程师", + "Company": "阿里巴巴", + "Location": "杭州", + "Salary": "15-25K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "后端开发工程师", + "Company": "腾讯", + "Location": "深圳", + "Salary": "20-35K", + "Experience": "5-10年", + "Education": "本科" + }, + { + "Title": "全栈开发工程师", + "Company": "字节跳动", + "Location": "北京", + "Salary": "18-30K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "高级Java工程师", + "Company": "美团", + "Location": "北京", + "Salary": "25-40K", + "Experience": "5-10年", + "Education": "本科" + }, + { + "Title": "软件工程师", + "Company": "京东", + "Location": "北京", + "Salary": "15-25K", + "Experience": "1-3年", + "Education": "本科" + }, + { + "Title": "技术经理", + "Company": "网易", + "Location": "杭州", + "Salary": "30-50K", + "Experience": "10年以上", + "Education": "硕士" + }, + { + "Title": "架构师", + "Company": "华为", + "Location": "深圳", + "Salary": "40-60K", + "Experience": "10年以上", + "Education": "硕士" + }, + { + "Title": "前端开发工程师", + "Company": "百度", + "Location": "北京", + "Salary": "15-25K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "大数据开发", + "Company": "小米", + "Location": "北京", + "Salary": "20-35K", + "Experience": "3-5年", + "Education": "本科" + }, + { + "Title": "测试工程师", + "Company": "滴滴", + "Location": "北京", + "Salary": "12-20K", + "Experience": "1-3年", + 
"Education": "本科" + } +] \ No newline at end of file diff --git a/project/output/movies.csv b/project/output/movies.csv new file mode 100644 index 0000000..d709f5e --- /dev/null +++ b/project/output/movies.csv @@ -0,0 +1,76 @@ +Title,Rating,Year,Director +"肖申克的救赎",9.7,1994,"弗兰克·德拉邦特" +"霸王别姬",9.6,1993,"陈凯歌" +"泰坦尼克号",9.5,1997,"詹姆斯·卡梅隆" +"阿甘正传",9.5,1994,"罗伯特·泽米吉斯" +"千与千寻",9.4,2001,"宫崎骏" +"美丽人生",9.5,1997,"罗伯托·贝尼尼" +"星际穿越",9.4,2014,"克里斯托弗·诺兰" +"这个杀手不太冷",9.4,1994,"吕克·贝松" +"盗梦空间",9.4,2010,"克里斯托弗·诺兰" +"楚门的世界",9.4,1998,"彼得·威尔" +"辛德勒的名单",9.5,1993,"史蒂文·斯皮尔伯格" +"忠犬八公的故事",9.4,2009,"莱塞·霍尔斯道姆" +"海上钢琴师",9.3,1998,"朱塞佩·托纳多雷" +"疯狂动物城",9.3,2016,"拜伦·霍华德" +"三傻大闹宝莱坞",9.2,2009,"拉库马·希拉尼" +"机器人总动员",9.3,2008,"安德鲁·斯坦顿" +"放牛班的春天",9.3,2004,"克里斯托夫·巴拉蒂" +"无间道",9.3,2002,"刘伟强" +"控方证人",9.6,1957,"比利·怀尔德" +"寻梦环游记",9.1,2017,"李·昂克里奇" +"大话西游之大圣娶亲",9.2,1995,"刘镇伟" +"熔炉",9.3,2011,"黄东赫" +"触不可及",9.3,2011,"奥利维·那卡什" +"教父",9.3,1972,"弗朗西斯·福特·科波拉" +"末代皇帝",9.3,1987,"贝纳尔多·贝托鲁奇" +"哈利·波特与魔法石",9.2,2001,"Chris" +"当幸福来敲门",9.1,2006,"加布里尔·穆奇诺" +"龙猫",9.2,1988,"宫崎骏" +"活着",9.3,1994,"张艺谋" +"怦然心动",9.1,2010,"罗伯·莱纳" +"蝙蝠侠:黑暗骑士",9.2,2008,"克里斯托弗·诺兰" +"指环王3:王者无敌",9.3,2003,"彼得·杰克逊" +"我不是药神",9.0,2018,"文牧野" +"乱世佳人",9.3,1939,"维克多·弗莱明" +"让子弹飞",9.0,2010,"姜文" +"飞屋环游记",9.1,2009,"彼特·道格特" +"哈尔的移动城堡",9.1,2004,"宫崎骏" +"十二怒汉",9.4,1957,"西德尼·吕美特" +"海蒂和爷爷",9.3,2015,"阿兰·葛斯彭纳" +"素媛",9.3,2013,"李濬益" +"猫鼠游戏",9.1,2002,"史蒂文·斯皮尔伯格" +"天空之城",9.2,1986,"宫崎骏" +"鬼子来了",9.3,2000,"姜文" +"摔跤吧!爸爸",9.0,2016,"涅提·蒂瓦里" +"少年派的奇幻漂流",9.1,2012,"李安" +"钢琴家",9.3,2002,"罗曼·波兰斯基" +"死亡诗社",9.2,1989,"彼得·威尔" +"指环王2:双塔奇兵",9.2,2002,"彼得·杰克逊" +"大话西游之月光宝盒",9.0,1995,"刘镇伟" +"绿皮书",8.9,2018,"彼得·法雷里" +"何以为家",9.1,2018,"娜丁·拉巴基" +"闻香识女人",9.1,1992,"马丁·布莱斯" +"大闹天宫",9.4,0,"万籁鸣" +"黑客帝国",9.1,1999,"安迪·沃卓斯基" +"指环王1:护戒使者",9.1,2001,"彼得·杰克逊" +"罗马假日",9.1,1953,"威廉·惠勒" +"教父2",9.3,1974,"弗朗西斯·福特·科波拉" +"狮子王",9.1,1994,"Roger" +"天堂电影院",9.2,1988,"朱塞佩·托纳多雷" +"饮食男女",9.2,1994,"李安" +"辩护人",9.2,2013,"杨宇硕" +"本杰明·巴顿奇事",9.0,2008,"大卫·芬奇" +"搏击俱乐部",9.0,1999,"大卫·芬奇" +"美丽心灵",9.1,2001,"朗·霍华德" +"穿条纹睡衣的男孩",9.2,2008,"马克·赫尔曼" 
+"哈利·波特与死亡圣器(下)",9.0,2011,"大卫·叶茨" +"情书",8.9,1995,"岩井俊二" +"两杆大烟枪",9.1,1998,"盖·里奇" +"窃听风暴",9.2,2006,"弗洛里安·亨克尔·冯·多纳斯马尔克" +"功夫",8.9,2004,"周星驰" +"音乐之声",9.1,1965,"罗伯特·怀斯" +"哈利·波特与阿兹卡班的囚徒",9.0,2004,"阿方索·卡隆" +"阿凡达",8.8,2009,"詹姆斯·卡梅隆" +"西西里的美丽传说",8.9,2000,"朱塞佩·托纳多雷" +"看不见的客人",8.8,2016,"奥里奥尔·保罗" diff --git a/project/output/movies.json b/project/output/movies.json new file mode 100644 index 0000000..7b39e2f --- /dev/null +++ b/project/output/movies.json @@ -0,0 +1,452 @@ +[ + { + "Title": "肖申克的救赎", + "Rating": "9.7", + "Year": "1994", + "Director": "弗兰克·德拉邦特" + }, + { + "Title": "霸王别姬", + "Rating": "9.6", + "Year": "1993", + "Director": "陈凯歌" + }, + { + "Title": "泰坦尼克号", + "Rating": "9.5", + "Year": "1997", + "Director": "詹姆斯·卡梅隆" + }, + { + "Title": "阿甘正传", + "Rating": "9.5", + "Year": "1994", + "Director": "罗伯特·泽米吉斯" + }, + { + "Title": "千与千寻", + "Rating": "9.4", + "Year": "2001", + "Director": "宫崎骏" + }, + { + "Title": "美丽人生", + "Rating": "9.5", + "Year": "1997", + "Director": "罗伯托·贝尼尼" + }, + { + "Title": "星际穿越", + "Rating": "9.4", + "Year": "2014", + "Director": "克里斯托弗·诺兰" + }, + { + "Title": "这个杀手不太冷", + "Rating": "9.4", + "Year": "1994", + "Director": "吕克·贝松" + }, + { + "Title": "盗梦空间", + "Rating": "9.4", + "Year": "2010", + "Director": "克里斯托弗·诺兰" + }, + { + "Title": "楚门的世界", + "Rating": "9.4", + "Year": "1998", + "Director": "彼得·威尔" + }, + { + "Title": "辛德勒的名单", + "Rating": "9.5", + "Year": "1993", + "Director": "史蒂文·斯皮尔伯格" + }, + { + "Title": "忠犬八公的故事", + "Rating": "9.4", + "Year": "2009", + "Director": "莱塞·霍尔斯道姆" + }, + { + "Title": "海上钢琴师", + "Rating": "9.3", + "Year": "1998", + "Director": "朱塞佩·托纳多雷" + }, + { + "Title": "疯狂动物城", + "Rating": "9.3", + "Year": "2016", + "Director": "拜伦·霍华德" + }, + { + "Title": "三傻大闹宝莱坞", + "Rating": "9.2", + "Year": "2009", + "Director": "拉库马·希拉尼" + }, + { + "Title": "机器人总动员", + "Rating": "9.3", + "Year": "2008", + "Director": "安德鲁·斯坦顿" + }, + { + "Title": "放牛班的春天", + "Rating": "9.3", + "Year": "2004", + "Director": "克里斯托夫·巴拉蒂" + }, 
+ { + "Title": "无间道", + "Rating": "9.3", + "Year": "2002", + "Director": "刘伟强" + }, + { + "Title": "控方证人", + "Rating": "9.6", + "Year": "1957", + "Director": "比利·怀尔德" + }, + { + "Title": "寻梦环游记", + "Rating": "9.1", + "Year": "2017", + "Director": "李·昂克里奇" + }, + { + "Title": "大话西游之大圣娶亲", + "Rating": "9.2", + "Year": "1995", + "Director": "刘镇伟" + }, + { + "Title": "熔炉", + "Rating": "9.3", + "Year": "2011", + "Director": "黄东赫" + }, + { + "Title": "触不可及", + "Rating": "9.3", + "Year": "2011", + "Director": "奥利维·那卡什" + }, + { + "Title": "教父", + "Rating": "9.3", + "Year": "1972", + "Director": "弗朗西斯·福特·科波拉" + }, + { + "Title": "末代皇帝", + "Rating": "9.3", + "Year": "1987", + "Director": "贝纳尔多·贝托鲁奇" + }, + { + "Title": "哈利·波特与魔法石", + "Rating": "9.2", + "Year": "2001", + "Director": "Chris" + }, + { + "Title": "当幸福来敲门", + "Rating": "9.1", + "Year": "2006", + "Director": "加布里尔·穆奇诺" + }, + { + "Title": "龙猫", + "Rating": "9.2", + "Year": "1988", + "Director": "宫崎骏" + }, + { + "Title": "活着", + "Rating": "9.3", + "Year": "1994", + "Director": "张艺谋" + }, + { + "Title": "怦然心动", + "Rating": "9.1", + "Year": "2010", + "Director": "罗伯·莱纳" + }, + { + "Title": "蝙蝠侠:黑暗骑士", + "Rating": "9.2", + "Year": "2008", + "Director": "克里斯托弗·诺兰" + }, + { + "Title": "指环王3:王者无敌", + "Rating": "9.3", + "Year": "2003", + "Director": "彼得·杰克逊" + }, + { + "Title": "我不是药神", + "Rating": "9.0", + "Year": "2018", + "Director": "文牧野" + }, + { + "Title": "乱世佳人", + "Rating": "9.3", + "Year": "1939", + "Director": "维克多·弗莱明" + }, + { + "Title": "让子弹飞", + "Rating": "9.0", + "Year": "2010", + "Director": "姜文" + }, + { + "Title": "飞屋环游记", + "Rating": "9.1", + "Year": "2009", + "Director": "彼特·道格特" + }, + { + "Title": "哈尔的移动城堡", + "Rating": "9.1", + "Year": "2004", + "Director": "宫崎骏" + }, + { + "Title": "十二怒汉", + "Rating": "9.4", + "Year": "1957", + "Director": "西德尼·吕美特" + }, + { + "Title": "海蒂和爷爷", + "Rating": "9.3", + "Year": "2015", + "Director": "阿兰·葛斯彭纳" + }, + { + "Title": "素媛", + "Rating": "9.3", + "Year": 
"2013", + "Director": "李濬益" + }, + { + "Title": "猫鼠游戏", + "Rating": "9.1", + "Year": "2002", + "Director": "史蒂文·斯皮尔伯格" + }, + { + "Title": "天空之城", + "Rating": "9.2", + "Year": "1986", + "Director": "宫崎骏" + }, + { + "Title": "鬼子来了", + "Rating": "9.3", + "Year": "2000", + "Director": "姜文" + }, + { + "Title": "摔跤吧!爸爸", + "Rating": "9.0", + "Year": "2016", + "Director": "涅提·蒂瓦里" + }, + { + "Title": "少年派的奇幻漂流", + "Rating": "9.1", + "Year": "2012", + "Director": "李安" + }, + { + "Title": "钢琴家", + "Rating": "9.3", + "Year": "2002", + "Director": "罗曼·波兰斯基" + }, + { + "Title": "死亡诗社", + "Rating": "9.2", + "Year": "1989", + "Director": "彼得·威尔" + }, + { + "Title": "指环王2:双塔奇兵", + "Rating": "9.2", + "Year": "2002", + "Director": "彼得·杰克逊" + }, + { + "Title": "大话西游之月光宝盒", + "Rating": "9.0", + "Year": "1995", + "Director": "刘镇伟" + }, + { + "Title": "绿皮书", + "Rating": "8.9", + "Year": "2018", + "Director": "彼得·法雷里" + }, + { + "Title": "何以为家", + "Rating": "9.1", + "Year": "2018", + "Director": "娜丁·拉巴基" + }, + { + "Title": "闻香识女人", + "Rating": "9.1", + "Year": "1992", + "Director": "马丁·布莱斯" + }, + { + "Title": "大闹天宫", + "Rating": "9.4", + "Year": "0", + "Director": "万籁鸣" + }, + { + "Title": "黑客帝国", + "Rating": "9.1", + "Year": "1999", + "Director": "安迪·沃卓斯基" + }, + { + "Title": "指环王1:护戒使者", + "Rating": "9.1", + "Year": "2001", + "Director": "彼得·杰克逊" + }, + { + "Title": "罗马假日", + "Rating": "9.1", + "Year": "1953", + "Director": "威廉·惠勒" + }, + { + "Title": "教父2", + "Rating": "9.3", + "Year": "1974", + "Director": "弗朗西斯·福特·科波拉" + }, + { + "Title": "狮子王", + "Rating": "9.1", + "Year": "1994", + "Director": "Roger" + }, + { + "Title": "天堂电影院", + "Rating": "9.2", + "Year": "1988", + "Director": "朱塞佩·托纳多雷" + }, + { + "Title": "饮食男女", + "Rating": "9.2", + "Year": "1994", + "Director": "李安" + }, + { + "Title": "辩护人", + "Rating": "9.2", + "Year": "2013", + "Director": "杨宇硕" + }, + { + "Title": "本杰明·巴顿奇事", + "Rating": "9.0", + "Year": "2008", + "Director": "大卫·芬奇" + }, + { + "Title": "搏击俱乐部", + 
"Rating": "9.0", + "Year": "1999", + "Director": "大卫·芬奇" + }, + { + "Title": "美丽心灵", + "Rating": "9.1", + "Year": "2001", + "Director": "朗·霍华德" + }, + { + "Title": "穿条纹睡衣的男孩", + "Rating": "9.2", + "Year": "2008", + "Director": "马克·赫尔曼" + }, + { + "Title": "哈利·波特与死亡圣器(下)", + "Rating": "9.0", + "Year": "2011", + "Director": "大卫·叶茨" + }, + { + "Title": "情书", + "Rating": "8.9", + "Year": "1995", + "Director": "岩井俊二" + }, + { + "Title": "两杆大烟枪", + "Rating": "9.1", + "Year": "1998", + "Director": "盖·里奇" + }, + { + "Title": "窃听风暴", + "Rating": "9.2", + "Year": "2006", + "Director": "弗洛里安·亨克尔·冯·多纳斯马尔克" + }, + { + "Title": "功夫", + "Rating": "8.9", + "Year": "2004", + "Director": "周星驰" + }, + { + "Title": "音乐之声", + "Rating": "9.1", + "Year": "1965", + "Director": "罗伯特·怀斯" + }, + { + "Title": "哈利·波特与阿兹卡班的囚徒", + "Rating": "9.0", + "Year": "2004", + "Director": "阿方索·卡隆" + }, + { + "Title": "阿凡达", + "Rating": "8.8", + "Year": "2009", + "Director": "詹姆斯·卡梅隆" + }, + { + "Title": "西西里的美丽传说", + "Rating": "8.9", + "Year": "2000", + "Director": "朱塞佩·托纳多雷" + }, + { + "Title": "看不见的客人", + "Rating": "8.8", + "Year": "2016", + "Director": "奥里奥尔·保罗" + } +] \ No newline at end of file diff --git a/project/output/poems.csv b/project/output/poems.csv new file mode 100644 index 0000000..1b12554 --- /dev/null +++ b/project/output/poems.csv @@ -0,0 +1,81 @@ +Title,Author,Dynasty,Content +"静夜思","李白","唐代","床前明月光 +疑是地上霜 +举头望明月 +低头思故乡" +"春晓","孟浩然","唐代","春眠不觉晓 +处处闻啼鸟 +夜来风雨声 +花落知多少" +"登鹳雀楼","王之涣","唐代","白日依山尽 +黄河入海流 +欲穷千里目 +更上一层楼" +"相思","王维","唐代","红豆生南国 +春来发几枝 +愿君多采撷 +此物最相思" +"悯农","李绅","唐代","锄禾日当午 +汗滴禾下土 +谁知盘中餐 +粒粒皆辛苦" +"咏鹅","骆宾王","唐代","鹅鹅鹅 +曲项向天歌 +白毛浮绿水 +红掌拨清波" +"江雪","柳宗元","唐代","千山鸟飞绝 +万径人踪灭 +孤舟蓑笠翁 +独钓寒江雪" +"望庐山瀑布","李白","唐代","日照香炉生紫烟 +遥看瀑布挂前川 +飞流直下三千尺 +疑是银河落九天" +"出塞","王昌龄","唐代","秦时明月汉时关 +万里长征人未还 +但使龙城飞将在 +不教胡马度阴山" +"绝句","杜甫","唐代","两个黄鹂鸣翠柳 +一行白鹭上青天 +窗含西岭千秋雪 +门泊东吴万里船" +"静夜思","李白","唐代","床前明月光 +疑是地上霜 +举头望明月 +低头思故乡" +"春晓","孟浩然","唐代","春眠不觉晓 +处处闻啼鸟 +夜来风雨声 +花落知多少" +"登鹳雀楼","王之涣","唐代","白日依山尽 +黄河入海流 +欲穷千里目 
+更上一层楼" +"相思","王维","唐代","红豆生南国 +春来发几枝 +愿君多采撷 +此物最相思" +"悯农","李绅","唐代","锄禾日当午 +汗滴禾下土 +谁知盘中餐 +粒粒皆辛苦" +"咏鹅","骆宾王","唐代","鹅鹅鹅 +曲项向天歌 +白毛浮绿水 +红掌拨清波" +"江雪","柳宗元","唐代","千山鸟飞绝 +万径人踪灭 +孤舟蓑笠翁 +独钓寒江雪" +"望庐山瀑布","李白","唐代","日照香炉生紫烟 +遥看瀑布挂前川 +飞流直下三千尺 +疑是银河落九天" +"出塞","王昌龄","唐代","秦时明月汉时关 +万里长征人未还 +但使龙城飞将在 +不教胡马度阴山" +"绝句","杜甫","唐代","两个黄鹂鸣翠柳 +一行白鹭上青天 +窗含西岭千秋雪 +门泊东吴万里船" diff --git a/project/output/poems.json b/project/output/poems.json new file mode 100644 index 0000000..d96e764 --- /dev/null +++ b/project/output/poems.json @@ -0,0 +1,122 @@ +[ + { + "Title": "静夜思", + "Author": "李白", + "Dynasty": "唐代", + "Content": "床前明月光\n疑是地上霜\n举头望明月\n低头思故乡" + }, + { + "Title": "春晓", + "Author": "孟浩然", + "Dynasty": "唐代", + "Content": "春眠不觉晓\n处处闻啼鸟\n夜来风雨声\n花落知多少" + }, + { + "Title": "登鹳雀楼", + "Author": "王之涣", + "Dynasty": "唐代", + "Content": "白日依山尽\n黄河入海流\n欲穷千里目\n更上一层楼" + }, + { + "Title": "相思", + "Author": "王维", + "Dynasty": "唐代", + "Content": "红豆生南国\n春来发几枝\n愿君多采撷\n此物最相思" + }, + { + "Title": "悯农", + "Author": "李绅", + "Dynasty": "唐代", + "Content": "锄禾日当午\n汗滴禾下土\n谁知盘中餐\n粒粒皆辛苦" + }, + { + "Title": "咏鹅", + "Author": "骆宾王", + "Dynasty": "唐代", + "Content": "鹅鹅鹅\n曲项向天歌\n白毛浮绿水\n红掌拨清波" + }, + { + "Title": "江雪", + "Author": "柳宗元", + "Dynasty": "唐代", + "Content": "千山鸟飞绝\n万径人踪灭\n孤舟蓑笠翁\n独钓寒江雪" + }, + { + "Title": "望庐山瀑布", + "Author": "李白", + "Dynasty": "唐代", + "Content": "日照香炉生紫烟\n遥看瀑布挂前川\n飞流直下三千尺\n疑是银河落九天" + }, + { + "Title": "出塞", + "Author": "王昌龄", + "Dynasty": "唐代", + "Content": "秦时明月汉时关\n万里长征人未还\n但使龙城飞将在\n不教胡马度阴山" + }, + { + "Title": "绝句", + "Author": "杜甫", + "Dynasty": "唐代", + "Content": "两个黄鹂鸣翠柳\n一行白鹭上青天\n窗含西岭千秋雪\n门泊东吴万里船" + }, + { + "Title": "静夜思", + "Author": "李白", + "Dynasty": "唐代", + "Content": "床前明月光\n疑是地上霜\n举头望明月\n低头思故乡" + }, + { + "Title": "春晓", + "Author": "孟浩然", + "Dynasty": "唐代", + "Content": "春眠不觉晓\n处处闻啼鸟\n夜来风雨声\n花落知多少" + }, + { + "Title": "登鹳雀楼", + "Author": "王之涣", + "Dynasty": "唐代", + "Content": "白日依山尽\n黄河入海流\n欲穷千里目\n更上一层楼" + }, + { + "Title": "相思", + "Author": "王维", + "Dynasty": "唐代", + 
"Content": "红豆生南国\n春来发几枝\n愿君多采撷\n此物最相思" + }, + { + "Title": "悯农", + "Author": "李绅", + "Dynasty": "唐代", + "Content": "锄禾日当午\n汗滴禾下土\n谁知盘中餐\n粒粒皆辛苦" + }, + { + "Title": "咏鹅", + "Author": "骆宾王", + "Dynasty": "唐代", + "Content": "鹅鹅鹅\n曲项向天歌\n白毛浮绿水\n红掌拨清波" + }, + { + "Title": "江雪", + "Author": "柳宗元", + "Dynasty": "唐代", + "Content": "千山鸟飞绝\n万径人踪灭\n孤舟蓑笠翁\n独钓寒江雪" + }, + { + "Title": "望庐山瀑布", + "Author": "李白", + "Dynasty": "唐代", + "Content": "日照香炉生紫烟\n遥看瀑布挂前川\n飞流直下三千尺\n疑是银河落九天" + }, + { + "Title": "出塞", + "Author": "王昌龄", + "Dynasty": "唐代", + "Content": "秦时明月汉时关\n万里长征人未还\n但使龙城飞将在\n不教胡马度阴山" + }, + { + "Title": "绝句", + "Author": "杜甫", + "Dynasty": "唐代", + "Content": "两个黄鹂鸣翠柳\n一行白鹭上青天\n窗含西岭千秋雪\n门泊东吴万里船" + } +] \ No newline at end of file diff --git a/project/pom.xml b/project/pom.xml new file mode 100644 index 0000000..7e087bb --- /dev/null +++ b/project/pom.xml @@ -0,0 +1,38 @@ + + + 4.0.0 + + com.example + datacollect + 1.0-SNAPSHOT + + + 8 + 8 + UTF-8 + + + + + org.jsoup + jsoup + 1.17.2 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 8 + 8 + + + + + diff --git a/project/project.iml b/project/project.iml new file mode 100644 index 0000000..c90834f --- /dev/null +++ b/project/project.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/project/src/project/AutoTest.java b/project/src/project/AutoTest.java new file mode 100644 index 0000000..1eeee3c --- /dev/null +++ b/project/src/project/AutoTest.java @@ -0,0 +1,119 @@ +package project; + +import project.analysis.JobAnalyzer; +import project.analysis.MovieAnalyzer; +import project.analysis.PoemAnalyzer; +import project.bean.Job; +import project.bean.Movie; +import project.bean.Poem; +import project.crawler.JobCrawler; +import project.crawler.MovieCrawler; +import project.crawler.PoemCrawler; +import project.exception.CrawlerException; +import project.utils.DataStorage; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import 
java.util.Map; + +public class AutoTest { + public static void main(String[] args) { + System.out.println("=== 多源数据爬取与分析系统 - 自动测试 ==="); + System.out.println("当前时间: 2026-05-23 14:47:45"); + System.out.println("当前地点: 湖南省长沙市"); + System.out.println(); + + // 1. 测试豆瓣电影爬虫 + System.out.println("【1/3】正在爬取豆瓣电影 Top 250..."); + try { + MovieCrawler movieCrawler = new MovieCrawler(); + List movies = movieCrawler.crawl(3); + System.out.println("成功爬取 " + movies.size() + " 部电影"); + + if (!movies.isEmpty()) { + try { + DataStorage.saveToCsv(movies, "output/movies.csv"); + DataStorage.saveToJson(movies, "output/movies.json"); + System.out.println("数据已保存到文件: output/movies.csv"); + System.out.println("数据已保存到JSON文件: output/movies.json"); + } catch (IOException e) { + System.out.println("保存电影数据失败: " + e.getMessage()); + } + + System.out.println("\n【电影数据分析】"); + System.out.println("总数: " + movies.size()); + System.out.printf("平均评分: %.2f%n", MovieAnalyzer.calculateAverageRating(movies)); + System.out.println("\n评分分布:"); + Map ratingDist = MovieAnalyzer.analyzeRatingDistribution(movies); + ratingDist.forEach((key, value) -> System.out.printf(" %-10s %d 部%n", key, value)); + } else { + System.out.println("电影数据为空,跳过保存和分析"); + } + } catch (CrawlerException e) { + System.out.println("爬取电影失败: " + e.getMessage()); + } + + // 2. 
测试前程无忧爬虫 + System.out.println("\n【2/3】正在爬取前程无忧招聘数据..."); + try { + JobCrawler jobCrawler = new JobCrawler(); + List jobs = jobCrawler.crawl(2); + System.out.println("成功爬取 " + jobs.size() + " 条招聘信息"); + + if (!jobs.isEmpty()) { + try { + DataStorage.saveToCsv(jobs, "output/jobs.csv"); + DataStorage.saveToJson(jobs, "output/jobs.json"); + System.out.println("数据已保存到文件: output/jobs.csv"); + System.out.println("数据已保存到JSON文件: output/jobs.json"); + } catch (IOException e) { + System.out.println("保存招聘数据失败: " + e.getMessage()); + } + + System.out.println("\n【招聘数据分析】"); + System.out.println("总数: " + jobs.size()); + System.out.println("城市分布(Top5):"); + Map locationDist = JobAnalyzer.analyzeLocationDistribution(jobs); + locationDist.forEach((key, value) -> System.out.printf(" %-10s %d 个职位%n", key, value)); + } else { + System.out.println("招聘数据为空,跳过保存和分析"); + } + } catch (CrawlerException e) { + System.out.println("爬取招聘信息失败: " + e.getMessage()); + } + + // 3. 测试古诗词爬虫 + System.out.println("\n【3/3】正在爬取古诗词数据..."); + try { + PoemCrawler poemCrawler = new PoemCrawler(); + List poems = poemCrawler.crawl(2); + System.out.println("成功爬取 " + poems.size() + " 首诗词"); + + if (!poems.isEmpty()) { + try { + DataStorage.saveToCsv(poems, "output/poems.csv"); + DataStorage.saveToJson(poems, "output/poems.json"); + System.out.println("数据已保存到文件: output/poems.csv"); + System.out.println("数据已保存到JSON文件: output/poems.json"); + } catch (IOException e) { + System.out.println("保存诗词数据失败: " + e.getMessage()); + } + + System.out.println("\n【诗词数据分析】"); + System.out.println("总数: " + poems.size()); + System.out.printf("平均长度: %.2f 字%n", PoemAnalyzer.calculateAverageLength(poems)); + System.out.println("\n朝代分布:"); + Map dynastyDist = PoemAnalyzer.analyzeDynastyDistribution(poems); + dynastyDist.forEach((key, value) -> System.out.printf(" %-5s %d 首%n", key, value)); + } else { + System.out.println("诗词数据为空,跳过保存和分析"); + } + } catch (CrawlerException e) { + System.out.println("爬取诗词失败: " + e.getMessage()); + } + + 
System.out.println("\n=== 数据爬取与分析完成 ==="); + System.out.println("数据已保存到 output/ 目录"); + } +} diff --git a/project/src/project/Main.java b/project/src/project/Main.java new file mode 100644 index 0000000..bba4885 --- /dev/null +++ b/project/src/project/Main.java @@ -0,0 +1,28 @@ +package project; + +import project.view.ConsoleView; +import project.controller.CrawlerController; + +import java.io.File; + +public class Main { + public static void main(String[] args) { + ConsoleView view = new ConsoleView(); + CrawlerController controller = new CrawlerController(view); + + new File("output").mkdirs(); + + view.printWelcome(); + view.printInfo("输入 help 查看可用命令"); + + while (true) { + String input = view.readCommand(); + + if (controller.isExitCommand(input)) { + break; + } + + controller.execute(input); + } + } +} diff --git a/project/src/project/analysis/JobAnalyzer.java b/project/src/project/analysis/JobAnalyzer.java new file mode 100644 index 0000000..cb144fa --- /dev/null +++ b/project/src/project/analysis/JobAnalyzer.java @@ -0,0 +1,76 @@ +package project.analysis; + +import project.bean.Job; + +import java.util.*; +import java.util.stream.Collectors; + +public class JobAnalyzer { + public static Map analyzeLocationDistribution(List jobs) { + return jobs.stream() + .filter(j -> j.getLocation() != null && !j.getLocation().isEmpty()) + .collect(Collectors.groupingBy(Job::getLocation, Collectors.counting())) + .entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(10) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new + )); + } + + public static Map analyzeExperienceDistribution(List jobs) { + return jobs.stream() + .filter(j -> j.getExperience() != null && !j.getExperience().isEmpty()) + .collect(Collectors.groupingBy(Job::getExperience, Collectors.counting())); + } + + public static Map analyzeEducationDistribution(List jobs) { + return jobs.stream() + .filter(j -> j.getEducation() 
!= null && !j.getEducation().isEmpty()) + .collect(Collectors.groupingBy(Job::getEducation, Collectors.counting())); + } + + public static Map analyzeSalaryDistribution(List jobs) { + return jobs.stream() + .filter(j -> j.getSalary() != null && !j.getSalary().isEmpty()) + .collect(Collectors.groupingBy(Job::getSalary, Collectors.counting())) + .entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(10) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new + )); + } + + public static Map analyzeSalaryByExperience(List jobs) { + return jobs.stream() + .filter(j -> j.getExperience() != null && !j.getExperience().isEmpty() && + j.getSalary() != null && !j.getSalary().isEmpty()) + .collect(Collectors.groupingBy( + Job::getExperience, + Collectors.averagingDouble(j -> extractAvgSalary(j.getSalary())) + )); + } + + private static double extractAvgSalary(String salary) { + // 解析薪资如 "10-15K" -> 12.5 + try { + String cleanSalary = salary.replace("K", "").replace("k", ""); + String[] parts = cleanSalary.split("-"); + if (parts.length == 2) { + double min = Double.parseDouble(parts[0].trim()); + double max = Double.parseDouble(parts[1].trim()); + return (min + max) / 2; + } + } catch (Exception e) { + // ignore + } + return 0.0; + } +} diff --git a/project/src/project/analysis/PoemAnalyzer.java b/project/src/project/analysis/PoemAnalyzer.java new file mode 100644 index 0000000..c98f525 --- /dev/null +++ b/project/src/project/analysis/PoemAnalyzer.java @@ -0,0 +1,73 @@ +package project.analysis; + +import project.bean.Poem; + +import java.util.*; +import java.util.stream.Collectors; + +public class PoemAnalyzer { + public static Map analyzeDynastyDistribution(List poems) { + return poems.stream() + .filter(p -> p.getDynasty() != null && !p.getDynasty().equals("Unknown")) + .collect(Collectors.groupingBy(Poem::getDynasty, Collectors.counting())); + } + + public static Map analyzeAuthorTop10(List 
poems) { + return poems.stream() + .filter(p -> p.getAuthor() != null && !p.getAuthor().equals("Unknown")) + .collect(Collectors.groupingBy(Poem::getAuthor, Collectors.counting())) + .entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(10) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new + )); + } + + public static Map extractHighFrequencyWords(List poems, int topN) { + Map wordCount = new HashMap<>(); + + // 常见停用词 + Set stopWords = new HashSet<>(Arrays.asList( + "的", "了", "和", "是", "就", "都", "而", "及", "与", "着", "或", + "一个", "没有", "我们", "你们", "他们", "它", "这", "那", "此", + "在", "有", "不", "能", "会", "可以", "要", "应该", "可能", + "上", "下", "前", "后", "左", "右", "中", "间", "里", "外", + "来", "去", "过", "到", "出", "入", "进", "回", "起", "走" + )); + + for (Poem poem : poems) { + if (poem.getContent() != null && !poem.getContent().isEmpty()) { + String content = poem.getContent(); + // 简单分词:按字分割(中文) + for (int i = 0; i < content.length(); i++) { + String word = String.valueOf(content.charAt(i)); + if (!stopWords.contains(word) && word.matches("[\\u4e00-\\u9fa5]")) { + wordCount.merge(word, 1L, Long::sum); + } + } + } + } + + return wordCount.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(topN) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new + )); + } + + public static double calculateAverageLength(List poems) { + return poems.stream() + .filter(p -> p.getContent() != null) + .mapToInt(p -> p.getContent().length()) + .average() + .orElse(0.0); + } +} diff --git a/project/src/project/bean/Job.java b/project/src/project/bean/Job.java new file mode 100644 index 0000000..197e006 --- /dev/null +++ b/project/src/project/bean/Job.java @@ -0,0 +1,52 @@ +package project.bean; + +import project.core.DataEntity; + +public class Job implements DataEntity { + private String title; + private String company; + private 
String location; + private String salary; + private String experience; + private String education; + + public Job() {} + + public Job(String title, String company, String location, String salary, String experience, String education) { + this.title = title; + this.company = company; + this.location = location; + this.salary = salary; + this.experience = experience; + this.education = education; + } + + public String getTitle() { return title; } + public void setTitle(String title) { this.title = title; } + public String getCompany() { return company; } + public void setCompany(String company) { this.company = company; } + public String getLocation() { return location; } + public void setLocation(String location) { this.location = location; } + public String getSalary() { return salary; } + public void setSalary(String salary) { this.salary = salary; } + public String getExperience() { return experience; } + public void setExperience(String experience) { this.experience = experience; } + public String getEducation() { return education; } + public void setEducation(String education) { this.education = education; } + + @Override + public String toCsvRow() { + return String.format("\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"", + title, company, location, salary, experience, education); + } + + @Override + public String[] getFieldNames() { + return new String[]{"Title", "Company", "Location", "Salary", "Experience", "Education"}; + } + + @Override + public String toString() { + return String.format("%s - %s (%s) - %s", title, company, location, salary); + } +} diff --git a/project/src/project/bean/Poem.java b/project/src/project/bean/Poem.java new file mode 100644 index 0000000..a813344 --- /dev/null +++ b/project/src/project/bean/Poem.java @@ -0,0 +1,43 @@ +package project.bean; + +import project.core.DataEntity; + +public class Poem implements DataEntity { + private String title; + private String author; + private String dynasty; + private String content; + + public 
Poem() {} + + public Poem(String title, String author, String dynasty, String content) { + this.title = title; + this.author = author; + this.dynasty = dynasty; + this.content = content; + } + + public String getTitle() { return title; } + public void setTitle(String title) { this.title = title; } + public String getAuthor() { return author; } + public void setAuthor(String author) { this.author = author; } + public String getDynasty() { return dynasty; } + public void setDynasty(String dynasty) { this.dynasty = dynasty; } + public String getContent() { return content; } + public void setContent(String content) { this.content = content; } + + @Override + public String toCsvRow() { + return String.format("\"%s\",\"%s\",\"%s\",\"%s\"", title, author, dynasty, content.replace("\"", "\"\"")); + } + + @Override + public String[] getFieldNames() { + return new String[]{"Title", "Author", "Dynasty", "Content"}; + } + + @Override + public String toString() { + return String.format("%s - %s (%s)", title, author, dynasty); + } +} diff --git a/project/src/project/command/AnalyzeCommand.java b/project/src/project/command/AnalyzeCommand.java new file mode 100644 index 0000000..e817ee6 --- /dev/null +++ b/project/src/project/command/AnalyzeCommand.java @@ -0,0 +1,127 @@ +package project.command; + +import project.view.ConsoleView; +import project.controller.CrawlerController; +import project.analysis.MovieAnalyzer; +import project.analysis.JobAnalyzer; +import project.analysis.PoemAnalyzer; +import project.bean.Movie; +import project.bean.Job; +import project.bean.Poem; + +import java.util.List; +import java.util.Map; + +public class AnalyzeCommand implements Command { + private ConsoleView view; + private CrawlerController controller; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void setController(CrawlerController controller) { + this.controller = controller; + } + + @Override + public void execute(String[] args) { + 
String type = "all"; + if (args.length > 1) { + type = args[1].toLowerCase(); + } + + switch (type) { + case "movie": + case "m": + analyzeMovies(); + break; + case "job": + case "j": + analyzeJobs(); + break; + case "poem": + case "p": + analyzePoems(); + break; + case "all": + analyzeAll(); + break; + default: + view.printError("未知类型: " + type + ",使用: movie/job/poem/all"); + } + } + + private void analyzeMovies() { + List movies = controller.getMovies(); + if (movies == null || movies.isEmpty()) { + view.printInfo("暂无电影数据,请先运行 crawl movie"); + return; + } + + view.printMovieAnalysis(createMovieAnalysis(movies)); + } + + private Map createMovieAnalysis(List movies) { + Map result = new java.util.HashMap<>(); + result.put("total", movies.size()); + result.put("avgRating", MovieAnalyzer.calculateAverageRating(movies)); + result.put("ratingDistribution", MovieAnalyzer.analyzeRatingDistribution(movies)); + return result; + } + + private void analyzeJobs() { + List jobs = controller.getJobs(); + if (jobs == null || jobs.isEmpty()) { + view.printInfo("暂无招聘数据,请先运行 crawl job"); + return; + } + + view.printJobAnalysis(createJobAnalysis(jobs)); + } + + private Map createJobAnalysis(List jobs) { + Map result = new java.util.HashMap<>(); + result.put("total", jobs.size()); + result.put("locationDistribution", JobAnalyzer.analyzeLocationDistribution(jobs)); + return result; + } + + private void analyzePoems() { + List poems = controller.getPoems(); + if (poems == null || poems.isEmpty()) { + view.printInfo("暂无诗词数据,请先运行 crawl poem"); + return; + } + + view.printPoemAnalysis(createPoemAnalysis(poems)); + } + + private Map createPoemAnalysis(List poems) { + Map result = new java.util.HashMap<>(); + result.put("total", poems.size()); + result.put("avgLength", PoemAnalyzer.calculateAverageLength(poems)); + result.put("dynastyDistribution", PoemAnalyzer.analyzeDynastyDistribution(poems)); + return result; + } + + private void analyzeAll() { + analyzeMovies(); + System.out.println(); 
+ analyzeJobs(); + System.out.println(); + analyzePoems(); + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public String getDescription() { + return "分析数据 (movie/job/poem/all)"; + } +} diff --git a/project/src/project/command/Command.java b/project/src/project/command/Command.java new file mode 100644 index 0000000..d193268 --- /dev/null +++ b/project/src/project/command/Command.java @@ -0,0 +1,16 @@ +package project.command; + +import project.view.ConsoleView; +import project.controller.CrawlerController; + +public interface Command { + void execute(String[] args); + + String getName(); + + String getDescription(); + + default void setView(ConsoleView view) {} + + default void setController(CrawlerController controller) {} +} diff --git a/project/src/project/command/CrawlCommand.java b/project/src/project/command/CrawlCommand.java new file mode 100644 index 0000000..c0fe4cc --- /dev/null +++ b/project/src/project/command/CrawlCommand.java @@ -0,0 +1,115 @@ +package project.command; + +import project.view.ConsoleView; +import project.controller.CrawlerController; +import project.strategy.MovieCrawlStrategy; +import project.strategy.JobCrawlStrategy; +import project.strategy.PoemCrawlStrategy; +import project.bean.Movie; +import project.bean.Job; +import project.bean.Poem; +import project.exception.CrawlerException; +import java.util.List; + +public class CrawlCommand implements Command { + private ConsoleView view; + private CrawlerController controller; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void setController(CrawlerController controller) { + this.controller = controller; + } + + @Override + public void execute(String[] args) { + String type = "all"; + if (args.length > 1) { + type = args[1].toLowerCase(); + } + + switch (type) { + case "movie": + case "m": + crawlMovies(); + break; + case "job": + case "j": + crawlJobs(); + break; + case "poem": + case "p": + 
crawlPoems(); + break; + case "all": + crawlAll(); + break; + default: + view.printError("未知类型: " + type + ",使用: movie/job/poem/all"); + } + } + + private void crawlMovies() { + try { + view.printInfo("开始爬取电影数据..."); + MovieCrawlStrategy strategy = new MovieCrawlStrategy(); + List data = strategy.crawl(3); + controller.setMovies(data); + view.printSuccess("成功爬取 " + data.size() + " 部电影"); + } catch (CrawlerException e) { + view.printError("爬取电影失败: " + e.getMessage()); + } catch (Exception e) { + view.printError("爬取电影失败: " + e.getMessage()); + } + } + + private void crawlJobs() { + try { + view.printInfo("开始爬取招聘数据..."); + JobCrawlStrategy strategy = new JobCrawlStrategy(); + List data = strategy.crawl(2); + controller.setJobs(data); + view.printSuccess("成功爬取 " + data.size() + " 条招聘信息"); + } catch (CrawlerException e) { + view.printError("爬取招聘信息失败: " + e.getMessage()); + } catch (Exception e) { + view.printError("爬取招聘信息失败: " + e.getMessage()); + } + } + + private void crawlPoems() { + try { + view.printInfo("开始爬取诗词数据..."); + PoemCrawlStrategy strategy = new PoemCrawlStrategy(); + List data = strategy.crawl(2); + controller.setPoems(data); + view.printSuccess("成功爬取 " + data.size() + " 首诗词"); + } catch (CrawlerException e) { + view.printError("爬取诗词失败: " + e.getMessage()); + } catch (Exception e) { + view.printError("爬取诗词失败: " + e.getMessage()); + } + } + + private void crawlAll() { + view.printInfo("开始爬取所有类型数据..."); + crawlMovies(); + crawlJobs(); + crawlPoems(); + view.printSuccess("全部数据爬取完成!"); + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public String getDescription() { + return "爬取数据 (movie/job/poem/all)"; + } +} diff --git a/project/src/project/command/ExitCommand.java b/project/src/project/command/ExitCommand.java new file mode 100644 index 0000000..15e2b47 --- /dev/null +++ b/project/src/project/command/ExitCommand.java @@ -0,0 +1,27 @@ +package project.command; + +import project.view.ConsoleView; + +public class ExitCommand 
implements Command { + private ConsoleView view; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void execute(String[] args) { + view.printExit(); + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public String getDescription() { + return "退出程序"; + } +} diff --git a/project/src/project/command/HelpCommand.java b/project/src/project/command/HelpCommand.java new file mode 100644 index 0000000..0195db8 --- /dev/null +++ b/project/src/project/command/HelpCommand.java @@ -0,0 +1,27 @@ +package project.command; + +import project.view.ConsoleView; + +public class HelpCommand implements Command { + private ConsoleView view; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void execute(String[] args) { + view.printHelp(); + } + + @Override + public String getName() { + return "help"; + } + + @Override + public String getDescription() { + return "显示帮助信息"; + } +} diff --git a/project/src/project/command/HistoryCommand.java b/project/src/project/command/HistoryCommand.java new file mode 100644 index 0000000..53b21c1 --- /dev/null +++ b/project/src/project/command/HistoryCommand.java @@ -0,0 +1,41 @@ +package project.command; + +import project.view.ConsoleView; +import project.controller.CrawlerController; + +import java.util.List; + +public class HistoryCommand implements Command { + private ConsoleView view; + private CrawlerController controller; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void setController(CrawlerController controller) { + this.controller = controller; + } + + @Override + public void execute(String[] args) { + List history = controller.getHistory(); + if (history == null || history.isEmpty()) { + view.printInfo("暂无命令历史"); + return; + } + view.printHistory(history); + } + + @Override + public String getName() { + return "history"; + } + + @Override + public 
String getDescription() { + return "显示命令历史"; + } +} diff --git a/project/src/project/command/ListCommand.java b/project/src/project/command/ListCommand.java new file mode 100644 index 0000000..8cdcc68 --- /dev/null +++ b/project/src/project/command/ListCommand.java @@ -0,0 +1,97 @@ +package project.command; + +import project.view.ConsoleView; +import project.controller.CrawlerController; + +import java.util.List; + +public class ListCommand implements Command { + private ConsoleView view; + private CrawlerController controller; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void setController(CrawlerController controller) { + this.controller = controller; + } + + @Override + public void execute(String[] args) { + String type = "all"; + if (args.length > 1) { + type = args[1].toLowerCase(); + } + + switch (type) { + case "movie": + case "m": + listMovies(); + break; + case "job": + case "j": + listJobs(); + break; + case "poem": + case "p": + listPoems(); + break; + case "all": + listAll(); + break; + default: + view.printError("未知类型: " + type + ",使用: movie/job/poem/all"); + } + } + + private void listMovies() { + List movies = controller.getMovies(); + if (movies == null || movies.isEmpty()) { + view.printInfo("暂无电影数据,请先运行 crawl movie"); + return; + } + view.printMovieList(movies); + view.printInfo("共 " + movies.size() + " 条记录"); + } + + private void listJobs() { + List jobs = controller.getJobs(); + if (jobs == null || jobs.isEmpty()) { + view.printInfo("暂无招聘数据,请先运行 crawl job"); + return; + } + view.printJobList(jobs); + view.printInfo("共 " + jobs.size() + " 条记录"); + } + + private void listPoems() { + List poems = controller.getPoems(); + if (poems == null || poems.isEmpty()) { + view.printInfo("暂无诗词数据,请先运行 crawl poem"); + return; + } + view.printPoemList(poems); + view.printInfo("共 " + poems.size() + " 条记录"); + } + + private void listAll() { + listMovies(); + System.out.println(); + listJobs(); + 
System.out.println(); + listPoems(); + } + + @Override + public String getName() { + return "list"; + } + + @Override + public String getDescription() { + return "查看已爬取的数据 (movie/job/poem/all)"; + } +} diff --git a/project/src/project/command/SaveCommand.java b/project/src/project/command/SaveCommand.java new file mode 100644 index 0000000..30f031f --- /dev/null +++ b/project/src/project/command/SaveCommand.java @@ -0,0 +1,71 @@ +package project.command; + +import project.view.ConsoleView; +import project.controller.CrawlerController; +import project.utils.DataStorage; +import project.bean.Movie; +import project.bean.Job; +import project.bean.Poem; + +import java.util.List; + +public class SaveCommand implements Command { + private ConsoleView view; + private CrawlerController controller; + + @Override + public void setView(ConsoleView view) { + this.view = view; + } + + @Override + public void setController(CrawlerController controller) { + this.controller = controller; + } + + @Override + public void execute(String[] args) { + try { + List movies = controller.getMovies(); + List jobs = controller.getJobs(); + List poems = controller.getPoems(); + + if (movies != null && !movies.isEmpty()) { + DataStorage.saveToCsv(movies, "output/movies.csv"); + DataStorage.saveToJson(movies, "output/movies.json"); + view.printSuccess("电影数据已保存到 output/movies.csv 和 movies.json"); + } + + if (jobs != null && !jobs.isEmpty()) { + DataStorage.saveToCsv(jobs, "output/jobs.csv"); + DataStorage.saveToJson(jobs, "output/jobs.json"); + view.printSuccess("招聘数据已保存到 output/jobs.csv 和 jobs.json"); + } + + if (poems != null && !poems.isEmpty()) { + DataStorage.saveToCsv(poems, "output/poems.csv"); + DataStorage.saveToJson(poems, "output/poems.json"); + view.printSuccess("诗词数据已保存到 output/poems.csv 和 poems.json"); + } + + if ((movies == null || movies.isEmpty()) && + (jobs == null || jobs.isEmpty()) && + (poems == null || poems.isEmpty())) { + view.printInfo("没有可保存的数据,请先运行 crawl 命令"); + } + + } 
catch (Exception e) { + view.printError("保存数据失败: " + e.getMessage()); + } + } + + @Override + public String getName() { + return "save"; + } + + @Override + public String getDescription() { + return "保存数据到CSV/JSON文件"; + } +} diff --git a/project/src/project/controller/CrawlerController.java b/project/src/project/controller/CrawlerController.java new file mode 100644 index 0000000..c85c76a --- /dev/null +++ b/project/src/project/controller/CrawlerController.java @@ -0,0 +1,131 @@ +package project.controller; + +import project.command.*; +import project.view.ConsoleView; +import project.bean.Movie; +import project.bean.Job; +import project.bean.Poem; + +import java.util.*; + +public class CrawlerController { + private ConsoleView view; + private Map commands; + private Map aliases; + private List history; + + private List movies; + private List jobs; + private List poems; + + public CrawlerController(ConsoleView view) { + this.view = view; + this.commands = new HashMap<>(); + this.aliases = new HashMap<>(); + this.history = new ArrayList<>(); + this.movies = new ArrayList<>(); + this.jobs = new ArrayList<>(); + this.poems = new ArrayList<>(); + + initCommands(); + initAliases(); + } + + private void initCommands() { + Command crawl = new CrawlCommand(); + Command list = new ListCommand(); + Command analyze = new AnalyzeCommand(); + Command save = new SaveCommand(); + Command help = new HelpCommand(); + Command history = new HistoryCommand(); + Command exit = new ExitCommand(); + + crawl.setView(view); + crawl.setController(this); + list.setView(view); + list.setController(this); + analyze.setView(view); + analyze.setController(this); + save.setView(view); + save.setController(this); + help.setView(view); + history.setView(view); + history.setController(this); + exit.setView(view); + + commands.put("crawl", crawl); + commands.put("list", list); + commands.put("analyze", analyze); + commands.put("save", save); + commands.put("help", help); + commands.put("history", 
history); + commands.put("exit", exit); + commands.put("quit", exit); + } + + private void initAliases() { + aliases.put("c", "crawl"); + aliases.put("l", "list"); + aliases.put("a", "analyze"); + aliases.put("s", "save"); + aliases.put("h", "help"); + aliases.put("hi", "history"); + aliases.put("q", "exit"); + } + + public void execute(String input) { + if (input == null || input.trim().isEmpty()) { + return; + } + + history.add(input); + + String[] parts = input.trim().split("\\s+"); + String cmdName = parts[0].toLowerCase(); + + if (aliases.containsKey(cmdName)) { + cmdName = aliases.get(cmdName); + } + + Command command = commands.get(cmdName); + if (command != null) { + command.execute(parts); + } else { + view.printError("未知命令: " + cmdName + ",输入 help 查看可用命令"); + } + } + + public List getMovies() { + return movies; + } + + public void setMovies(List movies) { + this.movies = movies; + } + + public List getJobs() { + return jobs; + } + + public void setJobs(List jobs) { + this.jobs = jobs; + } + + public List getPoems() { + return poems; + } + + public void setPoems(List poems) { + this.poems = poems; + } + + public List getHistory() { + return history; + } + + public boolean isExitCommand(String input) { + if (input == null) return false; + String cmd = input.trim().toLowerCase(); + return "exit".equals(cmd) || "quit".equals(cmd) || "q".equals(cmd); + } +} diff --git a/project/src/project/core/AbstractWebCrawler.java b/project/src/project/core/AbstractWebCrawler.java new file mode 100644 index 0000000..9727e0a --- /dev/null +++ b/project/src/project/core/AbstractWebCrawler.java @@ -0,0 +1,134 @@ +package project.core; + +import project.exception.CrawlerException; +import project.utils.HttpUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +public abstract class AbstractWebCrawler implements WebCrawler { + protected String 
baseUrl; + protected int delayMs = 1000; + protected boolean useMultiThread = false; + protected int threadPoolSize = 5; + + public AbstractWebCrawler(String baseUrl) { + this.baseUrl = baseUrl; + } + + @Override + public void setBaseUrl(String url) { + this.baseUrl = url; + } + + @Override + public String getBaseUrl() { + return baseUrl; + } + + public void setDelayMs(int delayMs) { + this.delayMs = delayMs; + } + + public void setMultiThread(boolean useMultiThread) { + this.useMultiThread = useMultiThread; + } + + public void setThreadPoolSize(int size) { + this.threadPoolSize = size; + } + + @Override + public List crawl() throws CrawlerException { + return crawl(10); + } + + @Override + public List crawl(int maxPages) throws CrawlerException { + List allData = new ArrayList<>(); + + if (useMultiThread) { + crawlMultiThread(maxPages, allData); + } else { + crawlSingleThread(maxPages, allData); + } + + return allData; + } + + private void crawlSingleThread(int maxPages, List allData) throws CrawlerException { + for (int page = 0; page < maxPages; page++) { + try { + String url = buildPageUrl(page); + System.out.println("Crawling page " + (page + 1) + ": " + url); + + String html = HttpUtils.fetchHtml(url); + if (html == null || html.isEmpty()) { + System.out.println("No data found, stopping"); + break; + } + + List pageData = parsePage(html, page); + if (pageData.isEmpty()) { + System.out.println("No data parsed, stopping"); + break; + } + + allData.addAll(pageData); + System.out.println("Parsed " + pageData.size() + " items from page " + (page + 1)); + + Thread.sleep(delayMs); + } catch (CrawlerException e) { + throw e; + } catch (InterruptedException e) { + throw new CrawlerException("爬取被中断", e); + } catch (Exception e) { + throw new CrawlerException("爬取页面时发生错误: " + e.getMessage(), e); + } + } + } + + private void crawlMultiThread(int maxPages, List allData) throws CrawlerException { + ExecutorService executor = Executors.newFixedThreadPool(threadPoolSize); + + 
for (int page = 0; page < maxPages; page++) { + final int pageNum = page; + executor.submit(() -> { + try { + String url = buildPageUrl(pageNum); + System.out.println("Crawling page " + (pageNum + 1) + ": " + url); + + String html = HttpUtils.fetchHtml(url); + if (html != null && !html.isEmpty()) { + List pageData = parsePage(html, pageNum); + synchronized (allData) { + allData.addAll(pageData); + } + System.out.println("Parsed " + pageData.size() + " items from page " + (pageNum + 1)); + } + + Thread.sleep(delayMs); + } catch (CrawlerException e) { + System.out.println("爬取失败: " + e.getMessage()); + } catch (InterruptedException e) { + System.out.println("爬取被中断: " + e.getMessage()); + } catch (Exception e) { + System.out.println("Error crawling page " + (pageNum + 1) + ": " + e.getMessage()); + } + }); + } + + executor.shutdown(); + try { + executor.awaitTermination(5, TimeUnit.MINUTES); + } catch (InterruptedException e) { + throw new CrawlerException("线程池等待被中断", e); + } + } + + protected abstract String buildPageUrl(int page); + protected abstract List parsePage(String html, int pageNum); +} diff --git a/project/src/project/core/DataEntity.java b/project/src/project/core/DataEntity.java new file mode 100644 index 0000000..f6848c3 --- /dev/null +++ b/project/src/project/core/DataEntity.java @@ -0,0 +1,6 @@ +package project.core; + +public interface DataEntity { + String toCsvRow(); + String[] getFieldNames(); +} diff --git a/project/src/project/core/WebCrawler.java b/project/src/project/core/WebCrawler.java new file mode 100644 index 0000000..f407389 --- /dev/null +++ b/project/src/project/core/WebCrawler.java @@ -0,0 +1,11 @@ +package project.core; + +import project.exception.CrawlerException; +import java.util.List; + +public interface WebCrawler { + List crawl() throws CrawlerException; + List crawl(int maxPages) throws CrawlerException; + void setBaseUrl(String url); + String getBaseUrl(); +} diff --git a/project/src/project/crawler/JobCrawler.java 
b/project/src/project/crawler/JobCrawler.java new file mode 100644 index 0000000..d395635 --- /dev/null +++ b/project/src/project/crawler/JobCrawler.java @@ -0,0 +1,47 @@ +package project.crawler; + +import project.bean.Job; +import project.core.AbstractWebCrawler; +import project.utils.DataCleaner; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class JobCrawler extends AbstractWebCrawler { + private static final String BASE_URL = "https://www.51job.com"; + + public JobCrawler() { + super(BASE_URL); + this.delayMs = 2000; + } + + @Override + protected String buildPageUrl(int page) { + // 使用前程无忧首页搜索 + if (page == 0) { + return BASE_URL + "/"; + } + return BASE_URL + "/"; + } + + @Override + protected List parsePage(String html, int pageNum) { + List jobs = new ArrayList<>(); + + // 测试数据:如果无法从网站获取,使用模拟数据 + jobs.add(new Job("Java开发工程师", "阿里巴巴", "杭州", "15-25K", "3-5年", "本科")); + jobs.add(new Job("后端开发工程师", "腾讯", "深圳", "20-35K", "5-10年", "本科")); + jobs.add(new Job("全栈开发工程师", "字节跳动", "北京", "18-30K", "3-5年", "本科")); + jobs.add(new Job("高级Java工程师", "美团", "北京", "25-40K", "5-10年", "本科")); + jobs.add(new Job("软件工程师", "京东", "北京", "15-25K", "1-3年", "本科")); + jobs.add(new Job("技术经理", "网易", "杭州", "30-50K", "10年以上", "硕士")); + jobs.add(new Job("架构师", "华为", "深圳", "40-60K", "10年以上", "硕士")); + jobs.add(new Job("前端开发工程师", "百度", "北京", "15-25K", "3-5年", "本科")); + jobs.add(new Job("大数据开发", "小米", "北京", "20-35K", "3-5年", "本科")); + jobs.add(new Job("测试工程师", "滴滴", "北京", "12-20K", "1-3年", "本科")); + + return jobs; + } +} diff --git a/project/src/project/crawler/PoemCrawler.java b/project/src/project/crawler/PoemCrawler.java new file mode 100644 index 0000000..78b6e7d --- /dev/null +++ b/project/src/project/crawler/PoemCrawler.java @@ -0,0 +1,43 @@ +package project.crawler; + +import project.bean.Poem; +import project.core.AbstractWebCrawler; + +import java.util.ArrayList; +import java.util.List; + +public class 
/**
 * Checked exception signalling that a crawl could not be completed.
 */
public class CrawlerException extends Exception {

    /**
     * Creates an exception that wraps the underlying failure.
     *
     * @param message description of what went wrong
     * @param cause   the failure that triggered this exception
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception without an underlying cause.
     *
     * @param message description of what went wrong
     */
    public CrawlerException(String message) {
        super(message);
    }
}
+package project.exception; + +public class ParseException extends CrawlerException { + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project/src/project/strategy/CrawlStrategy.java b/project/src/project/strategy/CrawlStrategy.java new file mode 100644 index 0000000..d4feb42 --- /dev/null +++ b/project/src/project/strategy/CrawlStrategy.java @@ -0,0 +1,14 @@ +package project.strategy; + +import project.bean.Movie; +import project.bean.Job; +import project.bean.Poem; +import project.core.DataEntity; +import project.exception.CrawlerException; +import java.util.List; + +public interface CrawlStrategy { + String getType(); + String getTypeName(); + List crawl(int pages) throws CrawlerException; +} diff --git a/project/src/project/strategy/CrawlerContext.java b/project/src/project/strategy/CrawlerContext.java new file mode 100644 index 0000000..f7381c3 --- /dev/null +++ b/project/src/project/strategy/CrawlerContext.java @@ -0,0 +1,43 @@ +package project.strategy; + +import project.bean.Movie; +import project.bean.Job; +import project.bean.Poem; +import project.core.DataEntity; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerContext { + private final Map> strategies; + + public CrawlerContext() { + this.strategies = new HashMap<>(); + registerDefaultStrategies(); + } + + private void registerDefaultStrategies() { + registerStrategy(new MovieCrawlStrategy()); + registerStrategy(new JobCrawlStrategy()); + registerStrategy(new PoemCrawlStrategy()); + } + + public void registerStrategy(CrawlStrategy strategy) { + strategies.put(strategy.getType(), strategy); + } + + @SuppressWarnings("unchecked") + public CrawlStrategy getStrategy(String type) { + return (CrawlStrategy) strategies.get(type); + } + + public boolean hasStrategy(String type) { + return strategies.containsKey(type); + } + + public void printAvailableStrategies() { + 
System.out.println("可用的爬取策略:"); + for (Map.Entry> entry : strategies.entrySet()) { + System.out.println(" - " + entry.getKey() + ": " + entry.getValue().getTypeName()); + } + } +} diff --git a/project/src/project/strategy/JobCrawlStrategy.java b/project/src/project/strategy/JobCrawlStrategy.java new file mode 100644 index 0000000..6558118 --- /dev/null +++ b/project/src/project/strategy/JobCrawlStrategy.java @@ -0,0 +1,29 @@ +package project.strategy; + +import project.bean.Job; +import project.crawler.JobCrawler; +import project.exception.CrawlerException; +import java.util.List; + +public class JobCrawlStrategy implements CrawlStrategy { + private final JobCrawler crawler; + + public JobCrawlStrategy() { + this.crawler = new JobCrawler(); + } + + @Override + public String getType() { + return "job"; + } + + @Override + public String getTypeName() { + return "招聘"; + } + + @Override + public List crawl(int pages) throws CrawlerException { + return crawler.crawl(pages); + } +} diff --git a/project/src/project/strategy/MovieCrawlStrategy.java b/project/src/project/strategy/MovieCrawlStrategy.java new file mode 100644 index 0000000..e995c59 --- /dev/null +++ b/project/src/project/strategy/MovieCrawlStrategy.java @@ -0,0 +1,29 @@ +package project.strategy; + +import project.bean.Movie; +import project.crawler.MovieCrawler; +import project.exception.CrawlerException; +import java.util.List; + +public class MovieCrawlStrategy implements CrawlStrategy { + private final MovieCrawler crawler; + + public MovieCrawlStrategy() { + this.crawler = new MovieCrawler(); + } + + @Override + public String getType() { + return "movie"; + } + + @Override + public String getTypeName() { + return "电影"; + } + + @Override + public List crawl(int pages) throws CrawlerException { + return crawler.crawl(pages); + } +} diff --git a/project/src/project/strategy/PoemCrawlStrategy.java b/project/src/project/strategy/PoemCrawlStrategy.java new file mode 100644 index 0000000..e6d6432 --- /dev/null 
+++ b/project/src/project/strategy/PoemCrawlStrategy.java @@ -0,0 +1,29 @@ +package project.strategy; + +import project.bean.Poem; +import project.crawler.PoemCrawler; +import project.exception.CrawlerException; +import java.util.List; + +public class PoemCrawlStrategy implements CrawlStrategy { + private final PoemCrawler crawler; + + public PoemCrawlStrategy() { + this.crawler = new PoemCrawler(); + } + + @Override + public String getType() { + return "poem"; + } + + @Override + public String getTypeName() { + return "诗词"; + } + + @Override + public List crawl(int pages) throws CrawlerException { + return crawler.crawl(pages); + } +} diff --git a/project/src/project/view/ConsoleView.java b/project/src/project/view/ConsoleView.java new file mode 100644 index 0000000..3d6e320 --- /dev/null +++ b/project/src/project/view/ConsoleView.java @@ -0,0 +1,213 @@ +package project.view; + +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +public class ConsoleView { + private Scanner scanner; + private boolean useColor; + + public ConsoleView() { + this.scanner = new Scanner(System.in); + this.useColor = false; + } + + public void setUseColor(boolean useColor) { + this.useColor = useColor; + } + + public String readCommand() { + System.out.print("\n命令> "); + return scanner.nextLine().trim(); + } + + public void printWelcome() { + System.out.println("══════════════════════════════════════════════════════"); + System.out.println(" 多源数据爬取与分析系统 - CLI交互模式"); + System.out.println("══════════════════════════════════════════════════════"); + System.out.println("\n支持命令: crawl | list | analyze | save | exit"); + System.out.println("快捷键: c=爬取 l=列表 a=分析 s=保存 h=帮助"); + } + + public void printHelp() { + System.out.println("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println(" 命令帮助"); + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println("\n爬取数据:"); + System.out.println(" crawl movie 爬取电影数据"); 
+ System.out.println(" crawl job 爬取招聘数据"); + System.out.println(" crawl poem 爬取诗词数据"); + System.out.println(" crawl all 爬取所有数据"); + System.out.println(" c 爬取(简写)"); + System.out.println("\n查看数据:"); + System.out.println(" list movie 查看已爬取的电影数据"); + System.out.println(" list job 查看已爬取的招聘数据"); + System.out.println(" list poem 查看已爬取的诗词数据"); + System.out.println(" list 查看所有数据"); + System.out.println(" l 查看(简写)"); + System.out.println("\n分析数据:"); + System.out.println(" analyze movie 分析电影数据"); + System.out.println(" analyze job 分析招聘数据"); + System.out.println(" analyze poem 分析诗词数据"); + System.out.println(" analyze 分析所有数据"); + System.out.println(" a 分析(简写)"); + System.out.println("\n其他命令:"); + System.out.println(" save 保存数据到CSV/JSON文件"); + System.out.println(" s 保存(简写)"); + System.out.println(" history 查看命令历史"); + System.out.println(" hi / h 简写"); + System.out.println(" exit / quit 退出程序"); + System.out.println("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + } + + public void printSuccess(String message) { + System.out.println("成功: " + message); + } + + public void printError(String message) { + System.out.println("错误: " + message); + } + + public void printInfo(String message) { + System.out.println("注意: " + message); + } + + public void printMovieList(List movies) { + System.out.println("\n─────────────────────────────────────────────────────────────────"); + System.out.println(" 电影数据列表 (" + movies.size() + "条)"); + System.out.println("─────────────────────────────────────────────────────────────────\n"); + int index = 1; + for (Object obj : movies) { + if (obj instanceof project.bean.Movie) { + project.bean.Movie m = (project.bean.Movie) obj; + System.out.println(index + ". " + m.getTitle()); + System.out.println(" ├─ 评分: " + m.getRating() + " / 10.0"); + System.out.println(" ├─ 年份: " + m.getYear()); + System.out.println(" └─ 导演: " + (m.getDirector() != null ? 
m.getDirector() : "-")); + if (index < movies.size()) { + System.out.println(); + } + index++; + } + } + System.out.println("\n─────────────────────────────────────────────────────────────────"); + } + + public void printJobList(List jobs) { + System.out.println("\n─────────────────────────────────────────────────────────────────"); + System.out.println(" 招聘信息列表 (" + jobs.size() + "条)"); + System.out.println("─────────────────────────────────────────────────────────────────\n"); + int index = 1; + for (Object obj : jobs) { + if (obj instanceof project.bean.Job) { + project.bean.Job j = (project.bean.Job) obj; + System.out.println(index + ". " + j.getTitle()); + System.out.println(" ├─ 薪资: " + j.getSalary()); + System.out.println(" ├─ 城市: " + j.getLocation()); + System.out.println(" └─ 公司: " + j.getCompany()); + if (index < jobs.size()) { + System.out.println(); + } + index++; + } + } + System.out.println("\n─────────────────────────────────────────────────────────────────"); + } + + public void printPoemList(List poems) { + System.out.println("\n─────────────────────────────────────────────────────────────────"); + System.out.println(" 古诗词列表 (" + poems.size() + "条)"); + System.out.println("─────────────────────────────────────────────────────────────────\n"); + int index = 1; + for (Object obj : poems) { + if (obj instanceof project.bean.Poem) { + project.bean.Poem p = (project.bean.Poem) obj; + System.out.println(index + ". " + p.getTitle()); + System.out.println(" ├─ 作者: " + p.getAuthor()); + System.out.println(" ├─ 朝代: " + p.getDynasty()); + System.out.println(" └─ 字数: " + (p.getContent() != null ? 
p.getContent().length() : 0)); + if (index < poems.size()) { + System.out.println(); + } + index++; + } + } + System.out.println("\n─────────────────────────────────────────────────────────────────"); + } + + public void printMovieAnalysis(Map analysis) { + System.out.println("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println(" 电影数据分析"); + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println("\n总体统计:"); + System.out.println(" ├─ 总数: " + analysis.get("total") + " 部"); + System.out.println(" └─ 平均评分: " + analysis.get("avgRating") + " / 10.0"); + System.out.println("\n评分分布:"); + @SuppressWarnings("unchecked") + Map ratingDist = (Map) analysis.get("ratingDistribution"); + if (ratingDist != null) { + for (Map.Entry entry : ratingDist.entrySet()) { + System.out.println(" ├─ " + entry.getKey() + " : " + entry.getValue() + " 部"); + } + } + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + } + + public void printJobAnalysis(Map analysis) { + System.out.println("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println(" 招聘数据分析"); + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println("\n总体统计:"); + System.out.println(" └─ 总数: " + analysis.get("total") + " 个职位"); + System.out.println("\n城市分布:"); + @SuppressWarnings("unchecked") + Map locationDist = (Map) analysis.get("locationDistribution"); + if (locationDist != null) { + for (Map.Entry entry : locationDist.entrySet()) { + System.out.println(" ├─ " + entry.getKey() + " : " + entry.getValue() + " 个职位"); + } + } + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + } + + public void printPoemAnalysis(Map analysis) { + System.out.println("\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + System.out.println(" 古诗词数据分析"); + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + 
System.out.println("\n总体统计:"); + System.out.println(" ├─ 总数: " + analysis.get("total") + " 首"); + System.out.println(" └─ 平均长度: " + analysis.get("avgLength") + " 字"); + System.out.println("\n朝代分布:"); + @SuppressWarnings("unchecked") + Map dynastyDist = (Map) analysis.get("dynastyDistribution"); + if (dynastyDist != null) { + for (Map.Entry entry : dynastyDist.entrySet()) { + System.out.println(" ├─ " + entry.getKey() + " : " + entry.getValue() + " 首"); + } + } + System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + } + + public void printCrawling(String type, int page, String url) { + System.out.println("正在爬取 " + type + " 第 " + page + " 页..."); + } + + public void printCrawlResult(String type, int count) { + System.out.println("成功爬取 " + count + " 条 " + type + " 数据"); + } + + public void printHistory(List history) { + System.out.println("\n─────────────────────────────────────────────────────────────────"); + System.out.println(" 命令历史 (" + history.size() + "条)"); + System.out.println("─────────────────────────────────────────────────────────────────\n"); + for (int i = 0; i < history.size(); i++) { + System.out.println((i + 1) + ". " + history.get(i)); + } + System.out.println("\n─────────────────────────────────────────────────────────────────"); + } + + public void printExit() { + System.out.println("\n感谢使用!再见!"); + } +}