39 changed files with 1360 additions and 0 deletions
Binary file not shown.
@ -0,0 +1,29 @@ |
|||||
|
### IntelliJ IDEA ### |
||||
|
out/ |
||||
|
!**/src/main/**/out/ |
||||
|
!**/src/test/**/out/ |
||||
|
|
||||
|
### Eclipse ### |
||||
|
.apt_generated |
||||
|
.classpath |
||||
|
.factorypath |
||||
|
.project |
||||
|
.settings |
||||
|
.springBeans |
||||
|
.sts4-cache |
||||
|
bin/ |
||||
|
!**/src/main/**/bin/ |
||||
|
!**/src/test/**/bin/ |
||||
|
|
||||
|
### NetBeans ### |
||||
|
/nbproject/private/ |
||||
|
/nbbuild/ |
||||
|
/dist/ |
||||
|
/nbdist/ |
||||
|
/.nb-gradle/ |
||||
|
|
||||
|
### VS Code ### |
||||
|
.vscode/ |
||||
|
|
||||
|
### Mac OS ### |
||||
|
.DS_Store |
||||
@ -0,0 +1,10 @@ |
|||||
|
# 默认忽略的文件 |
||||
|
/shelf/ |
||||
|
/workspace.xml |
||||
|
# 基于编辑器的 HTTP 客户端请求 |
||||
|
/httpRequests/ |
||||
|
# 依赖于环境的 Maven 主目录路径 |
||||
|
/mavenHomeManager.xml |
||||
|
# Datasource local storage ignored files |
||||
|
/dataSources/ |
||||
|
/dataSources.local.xml |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="fastjson2-2.0.32"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/fastjson2-2.0.32.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="jcommon-1.0.24"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/jcommon-1.0.24.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="jfreechart-1.5.3"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/jfreechart-1.5.3.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="jsoup-1.17.2"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/jsoup-1.17.2.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="kumo-core-1.12"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/kumo-core-1.12.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="logback-classic-1.4.11"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/logback-classic-1.4.11.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="logback-core-1.4.11"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/logback-core-1.4.11.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,9 @@ |
|||||
|
<component name="libraryTable"> |
||||
|
<library name="slf4j-api-2.0.9"> |
||||
|
<CLASSES> |
||||
|
<root url="jar://$USER_HOME$/Downloads/slf4j-api-2.0.9.jar!/" /> |
||||
|
</CLASSES> |
||||
|
<JAVADOC /> |
||||
|
<SOURCES /> |
||||
|
</library> |
||||
|
</component> |
||||
@ -0,0 +1,6 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="openjdk-26" project-jdk-type="JavaSDK"> |
||||
|
<output url="file://$PROJECT_DIR$/out" /> |
||||
|
</component> |
||||
|
</project> |
||||
@ -0,0 +1,8 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="ProjectModuleManager"> |
||||
|
<modules> |
||||
|
<module fileurl="file://$PROJECT_DIR$/CrawlerMain2.iml" filepath="$PROJECT_DIR$/CrawlerMain2.iml" /> |
||||
|
</modules> |
||||
|
</component> |
||||
|
</project> |
||||
@ -0,0 +1,19 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<module type="JAVA_MODULE" version="4"> |
||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true"> |
||||
|
<exclude-output /> |
||||
|
<content url="file://$MODULE_DIR$"> |
||||
|
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" /> |
||||
|
</content> |
||||
|
<orderEntry type="inheritedJdk" /> |
||||
|
<orderEntry type="sourceFolder" forTests="false" /> |
||||
|
<orderEntry type="library" name="jsoup-1.17.2" level="project" /> |
||||
|
<orderEntry type="library" name="jfreechart-1.5.3" level="project" /> |
||||
|
<orderEntry type="library" name="jcommon-1.0.24" level="project" /> |
||||
|
<orderEntry type="library" name="kumo-core-1.12" level="project" /> |
||||
|
<orderEntry type="library" name="logback-classic-1.4.11" level="project" /> |
||||
|
<orderEntry type="library" name="logback-core-1.4.11" level="project" /> |
||||
|
<orderEntry type="library" name="slf4j-api-2.0.9" level="project" /> |
||||
|
<orderEntry type="library" name="fastjson2-2.0.32" level="project" /> |
||||
|
</component> |
||||
|
</module> |
||||
@ -0,0 +1,150 @@ |
|||||
|
# Java爬虫综合项目(CLI + MVC + Command + 策略模式) |
||||
|
## 一、项目简介 |
||||
|
本项目实现了一个命令行菜单式爬虫,支持爬取豆瓣电影、王者荣耀英雄、中国天气网天气等数据。采用MVC、命令模式、策略模式、多层结构,集成日志体系与异常分包,代码结构规范,易于扩展和维护。 |
||||
|
|
||||
|
## 二、项目结构 |
||||
|
```` |
||||
|
CrawlerMain2 |
||||
|
├── .idea # IDEA 配置文件 |
||||
|
├── out # 编译输出目录 |
||||
|
└── src |
||||
|
├── command # 命令模式模块 |
||||
|
│ ├── AnalyzeCommand |
||||
|
│ ├── CommandInvoker |
||||
|
│ ├── CrawlCommand |
||||
|
│ ├── HeroCrawlCommand |
||||
|
│ ├── MovieCrawlCommand |
||||
|
│ ├── WeatherAnalyzeCommand |
||||
|
│ └── WeatherCrawlCommand |
||||
|
├── controller # 控制器与上下文 |
||||
|
│ └── CrawlerContext |
||||
|
├── crawler # 策略模式爬虫实现 |
||||
|
│ ├── BaseCrawler |
||||
|
│ ├── Crawler |
||||
|
│ ├── HeroCrawler |
||||
|
│ ├── MovieCrawler |
||||
|
│ └── WeatherCrawler |
||||
|
├── exception # 自定义异常体系 |
||||
|
├── model # 数据实体类 |
||||
|
│ ├── Hero |
||||
|
│ ├── Movie |
||||
|
│ └── Weather |
||||
|
├── util # 工具类 |
||||
|
│ └── DataUtil # JSON 导入导出、增量去重、文件 IO |
||||
|
├── view # 视图层(CLI 菜单交互) |
||||
|
│ └── CrawlerView |
||||
|
├── CrawlerMain # 程序入口 |
||||
|
└── logback.xml # 日志配置 |
||||
|
├── .gitignore # Git忽略文件配置 |
||||
|
└── CrawlerMain2.iml # IDEA模块配置 |
||||
|
```` |
||||
|
--- |
||||
|
## 三、功能介绍 |
||||
|
|
||||
|
- 命令行菜单,操作简单,支持多种数据源抓取 |
||||
|
- 豆瓣电影、王者荣耀英雄、中国天气网天气数据采集 |
||||
|
- 本地数据存储和分析统计 |
||||
|
- MVC分层、命令模式、策略模式设计,扩展方便 |
||||
|
- try-with-resources安全IO,保证资源释放与数据安全 |
||||
|
- 支持JSON序列化导出(movie.json等)与数据备份 |
||||
|
- 历史数据导入恢复功能,支持断点/回溯操作 |
||||
|
- 增量抓取机制,自动去重,避免重复采集 |
||||
|
- 日志体系与异常处理,项目健壮可追踪 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 四、依赖说明 |
||||
|
|
||||
|
- **JDK 8及以上版本** |
||||
|
- **jsoup**(网页解析) |
||||
|
- **slf4j**(日志接口) |
||||
|
- **logback**(日志实现) |
||||
|
- **fastjson2**(JSON处理框架,完成数据序列化/反序列化) |
||||
|
- IO相关类(java.io.File、FileReader、IOException等)实现文件操作 |
||||
|
- 集合类(Set、HashSet、Stream),用于增量抓取与数据去重 |
||||
|
|
||||
|
> 推荐使用 Maven(pom.xml)管理依赖,添加如下: |
||||
|
|
||||
|
```xml |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>1.15.3</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>org.slf4j</groupId> |
||||
|
<artifactId>slf4j-api</artifactId> |
||||
|
<version>1.7.36</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>ch.qos.logback</groupId> |
||||
|
<artifactId>logback-classic</artifactId> |
||||
|
<version>1.2.11</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.alibaba.fastjson2</groupId> |
||||
|
<artifactId>fastjson2</artifactId> |
||||
|
<version>2.0.47</version> |
||||
|
</dependency> |
||||
|
``` |
||||
|
如果不用 Maven,可手动下载相关 jar 包,并在 IDEA 的 Libraries 中添加。 |
||||
|
|
||||
|
--- |
||||
|
## 五、运行说明(IntelliJ IDEA) |
||||
|
1. 用 IDEA 打开项目根目录(包含 src/,logback.xml 等)。 |
||||
|
2. 配置 jsoup、slf4j、logback、fastjson2 依赖(建议用 Maven),或手动添加。 |
||||
|
3. 确认DataUtil.java里的数据存储路径有权限。如必要,修改为本地可用的目录。 |
||||
|
4. 右键CrawlerMain.java,选择“Run”,即可启动项目。 |
||||
|
5. 启动后按照命令行菜单提示输入数字操作,体验各类爬取、分析、导出、恢复功能: |
||||
|
```` |
||||
|
1 爬取豆瓣电影 |
||||
|
2 爬取王者荣耀英雄 |
||||
|
3 爬取全国天气 |
||||
|
4 电影、英雄数据分析(仅统计,不存储) |
||||
|
5 天气数据分析 |
||||
|
6 导入历史数据 |
||||
|
0 退出程序 |
||||
|
```` |
||||
|
程序在抓取时自动生成 JSON 文件(movie.json、hero.json、weather.json),支持导入历史数据并增量去重。 |
||||
|
|
||||
|
--- |
||||
|
## 六、主要设计与包说明 |
||||
|
| 包/类 | 功能描述 | |
||||
|
| ---------------- | ------------------------------------------ | |
||||
|
| exception | 自定义异常体系 | |
||||
|
| command | 命令模式相关类(命令封装、调度) | |
||||
|
| controller | 控制器上下文(业务流转管理) | |
||||
|
| crawler | 策略接口与具体爬虫实现 | |
||||
|
| model | 数据实体类(电影、英雄、天气) | |
||||
|
| util/DataUtil.java | 数据存储、历史恢复、JSON导入导出与增量去重 | |
||||
|
| view | CLI菜单视图,交互展示 | |
||||
|
--- |
||||
|
## 七、核心工具/新功能说明 |
||||
|
- DataUtil 工具类 |
||||
|
- 新增方法封装 JSON 序列化与反序列化(依赖 fastjson2 的 JSON 工具类) |
||||
|
- 支持文件安全读写(采用 try‑with‑resources 结构,自动释放 IO 资源) |
||||
|
- 增量抓取与去重,采用 Set/HashSet 结构自动过滤已采集的对象 |
||||
|
- 历史数据校验与导入,支持断点恢复(即重新导入 movie.json,恢复进度) |
||||
|
|
||||
|
--- |
||||
|
## 八、常见问题 |
||||
|
1. 存储权限报错:请修改 DataUtil 的数据路径为本机有权限的盘符或目录 |
||||
|
2. 依赖缺失或报错:确保 jsoup、slf4j、logback、fastjson2 已配置到项目 |
||||
|
3. JSON导入导出异常:检查 movie.json 是否存在且格式正确,或依赖版本是否兼容 |
||||
|
4. 重复抓取问题:增量去重会自动过滤相同对象,手动修改数据时注意唯一性 |
||||
|
5. 出现 IOException:请检查文件路径、权限是否正常 |
||||
|
|
||||
|
--- |
||||
|
## 九、扩展与二次开发说明 |
||||
|
- 新增网站爬虫或数据实体: |
||||
|
- 在 crawler 包下新建策略类(如 NewSiteCrawler.java,继承 BaseCrawler) |
||||
|
- 新建 command 类,添加进菜单与调度 |
||||
|
- 扩展 model 数据实体 |
||||
|
- 导出和导入功能可支持更多格式,只需扩展 DataUtil 工具类即可 |
||||
|
|
||||
|
--- |
||||
|
## 十、作者信息 |
||||
|
- 姓名:郑诗艺 |
||||
|
- 学号:202401070210 |
||||
|
- 班级:大数据管理与应用2402班 |
||||
|
- 日期:2026.5.24 |
||||
@ -0,0 +1,100 @@ |
|||||
|
@startuml 爬虫项目类图_完整竖版一张图 |
||||
|
' 全局设置:垂直布局、紧凑、单张大图、禁止分页 |
||||
|
skinparam layout topdown |
||||
|
skinparam ranksep 55 |
||||
|
skinparam nodesep 18 |
||||
|
skinparam dpi 300 |
||||
|
skinparam page { |
||||
|
width 100% |
||||
|
height 100% |
||||
|
} |
||||
|
skinparam class { |
||||
|
BackgroundColor #f0f5ff |
||||
|
BorderColor #222222 |
||||
|
ArrowColor #222222 |
||||
|
} |
||||
|
skinparam package { |
||||
|
BackgroundColor #eef4ff |
||||
|
BorderColor #222222 |
||||
|
} |
||||
|
|
||||
|
' 从上到下依次排列所有包,不拆分 |
||||
|
package model { |
||||
|
class Hero { |
||||
|
-name: String |
||||
|
-type: String |
||||
|
} |
||||
|
class Movie { |
||||
|
-title: String |
||||
|
-rate: Double |
||||
|
} |
||||
|
class Weather { |
||||
|
-city: String |
||||
|
-temp: String |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
package crawler { |
||||
|
interface Crawler { |
||||
|
+startCrawl(): List<?> |
||||
|
} |
||||
|
abstract class BaseCrawler implements Crawler { |
||||
|
+startCrawl(): List<?> |
||||
|
} |
||||
|
class MovieCrawler extends BaseCrawler |
||||
|
class HeroCrawler extends BaseCrawler |
||||
|
class WeatherCrawler extends BaseCrawler |
||||
|
} |
||||
|
|
||||
|
package exception { |
||||
|
class CrawlerException { |
||||
|
+CrawlerException(msg: String) |
||||
|
} |
||||
|
class NetworkException extends CrawlerException |
||||
|
class ParseException extends CrawlerException |
||||
|
} |
||||
|
|
||||
|
package command { |
||||
|
interface CrawlCommand { |
||||
|
+execute(): void |
||||
|
} |
||||
|
class CommandInvoker { |
||||
|
+runCommand(cmd: CrawlCommand): void |
||||
|
} |
||||
|
class MovieCrawlCommand implements CrawlCommand |
||||
|
class HeroCrawlCommand implements CrawlCommand |
||||
|
class WeatherCrawlCommand implements CrawlCommand |
||||
|
class AnalyzeCommand implements CrawlCommand |
||||
|
class WeatherAnalyzeCommand implements CrawlCommand |
||||
|
CommandInvoker -[dashed]-> CrawlCommand : 调度 |
||||
|
} |
||||
|
|
||||
|
package controller { |
||||
|
class CrawlerContext |
||||
|
} |
||||
|
|
||||
|
package view { |
||||
|
class CrawlerView { |
||||
|
+showMenu(): void |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
package util { |
||||
|
class DataUtil { |
||||
|
+exportJson(): void |
||||
|
+importJson(): List<?> |
||||
|
+removeDuplicate(): List<?> |
||||
|
+analyzeOnly(): void |
||||
|
+analyzeWeatherOnly(): void |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
class CrawlerMain |
||||
|
|
||||
|
' 依赖关系 |
||||
|
CrawlerMain --> CrawlerContext |
||||
|
CrawlerMain --> CrawlerView |
||||
|
CrawlerMain --> DataUtil |
||||
|
DataUtil -[dashed]-> ParseException : throws |
||||
|
|
||||
|
@enduml |
||||
@ -0,0 +1,101 @@ |
|||||
|
import command.AnalyzeCommand; |
||||
|
import command.HeroCrawlCommand; |
||||
|
import command.MovieCrawlCommand; |
||||
|
import command.WeatherAnalyzeCommand; |
||||
|
import command.WeatherCrawlCommand; |
||||
|
import command.CommandInvoker; |
||||
|
import controller.CrawlerContext; |
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Hero; |
||||
|
import model.Movie; |
||||
|
import model.Weather; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import util.DataUtil; |
||||
|
import view.CrawlerView; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
// ===================== 10. 主程序入口 =====================
|
||||
|
public class CrawlerMain { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerMain.class); |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
logger.info("===== 爬虫程序启动(CLI+MVC+Command+策略模式) ====="); |
||||
|
CrawlerView view = new CrawlerView(); |
||||
|
CrawlerContext context = new CrawlerContext(); |
||||
|
CommandInvoker invoker = new CommandInvoker(); |
||||
|
|
||||
|
MovieCrawlCommand movieCmd = new MovieCrawlCommand(context); |
||||
|
HeroCrawlCommand heroCmd = new HeroCrawlCommand(context); |
||||
|
WeatherCrawlCommand weatherCmd = new WeatherCrawlCommand(context); |
||||
|
AnalyzeCommand analyzeCmd = new AnalyzeCommand(movieCmd, heroCmd); |
||||
|
WeatherAnalyzeCommand weatherAnalyzeCmd = new WeatherAnalyzeCommand(weatherCmd); |
||||
|
|
||||
|
DataUtil.initFolder(); |
||||
|
|
||||
|
while (true) { |
||||
|
try { |
||||
|
view.showMenu(); |
||||
|
int op = view.getInput(); |
||||
|
switch (op) { |
||||
|
case 1: |
||||
|
invoker.runCommand(movieCmd); |
||||
|
view.showMsg("电影爬取完成"); |
||||
|
break; |
||||
|
case 2: |
||||
|
invoker.runCommand(heroCmd); |
||||
|
view.showMsg("英雄爬取完成"); |
||||
|
break; |
||||
|
case 3: |
||||
|
invoker.runCommand(weatherCmd); |
||||
|
view.showMsg("天气爬取完成"); |
||||
|
break; |
||||
|
case 4: |
||||
|
analyzeCmd.execute(); |
||||
|
break; |
||||
|
case 5: |
||||
|
weatherAnalyzeCmd.execute(); |
||||
|
break; |
||||
|
// 第6项:导入历史数据(无fileName:,无报错)
|
||||
|
case 6: |
||||
|
try { |
||||
|
List<Movie> movieList = DataUtil.importJson("movie.json", Movie.class); |
||||
|
List<Hero> heroList = DataUtil.importJson("hero.json", Hero.class); |
||||
|
List<Weather> weatherList = DataUtil.importJson("weather.json", Weather.class); |
||||
|
view.showMsg("✅ 历史数据导入成功!"); |
||||
|
view.showMsg("电影:" + movieList.size() + " 条"); |
||||
|
view.showMsg("英雄:" + heroList.size() + " 条"); |
||||
|
view.showMsg("天气:" + weatherList.size() + " 条"); |
||||
|
} catch (IOException e) { |
||||
|
view.showMsg("导入失败:" + e.getMessage()); |
||||
|
} |
||||
|
break; |
||||
|
case 0: |
||||
|
view.showMsg("程序退出"); |
||||
|
System.exit(0); |
||||
|
break; |
||||
|
default: |
||||
|
view.showMsg("指令错误,请重新输入"); |
||||
|
} |
||||
|
} catch (NumberFormatException e) { |
||||
|
view.showMsg("请输入数字!"); |
||||
|
} catch (NetworkException e) { |
||||
|
logger.error("网络异常:", e); |
||||
|
view.showMsg("网络异常:" + e.getMessage()); |
||||
|
} catch (ParseException e) { |
||||
|
logger.error("解析异常:", e); |
||||
|
view.showMsg("解析异常:" + e.getMessage()); |
||||
|
} catch (CrawlerException e) { |
||||
|
logger.error("爬虫异常:", e); |
||||
|
view.showMsg("爬虫异常:" + e.getMessage()); |
||||
|
}catch (IOException e) { |
||||
|
logger.error("IO异常:", e); |
||||
|
view.showMsg("IO异常:" + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import util.DataUtil; |
||||
|
|
||||
|
public class AnalyzeCommand implements CrawlCommand { |
||||
|
private final MovieCrawlCommand movieCmd; |
||||
|
private final HeroCrawlCommand heroCmd; |
||||
|
private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); |
||||
|
|
||||
|
public AnalyzeCommand(MovieCrawlCommand movieCmd, HeroCrawlCommand heroCmd) { |
||||
|
this.movieCmd = movieCmd; |
||||
|
this.heroCmd = heroCmd; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() { |
||||
|
DataUtil.analyzeOnly(movieCmd.getResult(), heroCmd.getResult()); |
||||
|
logger.info("电影&英雄数据分析命令执行完成(仅统计)"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,13 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
|
||||
|
public class CommandInvoker { |
||||
|
public void runCommand(CrawlCommand cmd) throws ParseException, NetworkException, CrawlerException, IOException { |
||||
|
cmd.execute(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,12 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
|
||||
|
// ===================== 8. Command模式 =====================
|
||||
|
public interface CrawlCommand { |
||||
|
void execute() throws ParseException, NetworkException, CrawlerException, IOException; |
||||
|
} |
||||
@ -0,0 +1,54 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerContext; |
||||
|
import crawler.HeroCrawler; |
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Hero; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import util.DataUtil; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class HeroCrawlCommand implements CrawlCommand { |
||||
|
private final CrawlerContext context; |
||||
|
private List<Hero> heroList; |
||||
|
private List<Hero> oldHeroList; |
||||
|
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(HeroCrawlCommand.class); |
||||
|
|
||||
|
public HeroCrawlCommand(CrawlerContext context) { |
||||
|
this.context = context; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws ParseException, NetworkException, CrawlerException, IOException { |
||||
|
oldHeroList = DataUtil.importJson("hero.json", Hero.class); |
||||
|
logger.info("导入历史英雄数据:{}条", oldHeroList.size()); |
||||
|
|
||||
|
// 第四点:增量抓取,过滤重复英雄
|
||||
|
Set<String> existNames = DataUtil.getExistHeroNames("hero.json"); |
||||
|
|
||||
|
context.setCrawlerStrategy(new HeroCrawler()); |
||||
|
heroList = (List<Hero>) context.executeCrawl(); |
||||
|
|
||||
|
heroList = heroList.stream() |
||||
|
.filter(hero -> !existNames.contains(hero.getName())) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
heroList.addAll(oldHeroList); |
||||
|
|
||||
|
DataUtil.addAll("英雄数据.txt", heroList); |
||||
|
DataUtil.exportJson("hero.json", heroList); |
||||
|
logger.info("英雄爬取完成,本次新增:{}条", heroList.size() - oldHeroList.size()); |
||||
|
} |
||||
|
|
||||
|
public List<Hero> getResult() { |
||||
|
return heroList; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,57 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerContext; |
||||
|
import crawler.MovieCrawler; |
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Movie; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import util.DataUtil; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class MovieCrawlCommand implements CrawlCommand { |
||||
|
private final CrawlerContext context; |
||||
|
private List<Movie> movieList; |
||||
|
private List<Movie> oldMovieList; |
||||
|
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(MovieCrawlCommand.class); |
||||
|
|
||||
|
public MovieCrawlCommand(CrawlerContext context) { |
||||
|
this.context = context; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws ParseException, NetworkException, CrawlerException, IOException { |
||||
|
// 第三点:导入历史数据
|
||||
|
oldMovieList = DataUtil.importJson("movie.json", Movie.class); |
||||
|
logger.info("导入历史电影数据:{}条", oldMovieList.size()); |
||||
|
|
||||
|
// 第四点:获取已存在的电影,增量抓取,避免重复
|
||||
|
Set<String> existTitles = DataUtil.getExistMovieTitles("movie.json"); |
||||
|
|
||||
|
context.setCrawlerStrategy(new MovieCrawler()); |
||||
|
movieList = (List<Movie>) context.executeCrawl(); |
||||
|
|
||||
|
// 过滤掉已经存在的电影,只保留新数据
|
||||
|
movieList = movieList.stream() |
||||
|
.filter(movie -> !existTitles.contains(movie.getTitle())) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
// 合并:新数据 + 历史数据
|
||||
|
movieList.addAll(oldMovieList); |
||||
|
|
||||
|
DataUtil.addAll("电影数据.txt", movieList); |
||||
|
DataUtil.exportJson("movie.json", movieList); |
||||
|
logger.info("电影爬取完成,本次新增:{}条", movieList.size() - oldMovieList.size()); |
||||
|
} |
||||
|
|
||||
|
public List<Movie> getResult() { |
||||
|
return movieList; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,20 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import util.DataUtil; |
||||
|
|
||||
|
public class WeatherAnalyzeCommand implements CrawlCommand { |
||||
|
private final WeatherCrawlCommand weatherCmd; |
||||
|
private static final Logger logger = LoggerFactory.getLogger(WeatherAnalyzeCommand.class); |
||||
|
|
||||
|
public WeatherAnalyzeCommand(WeatherCrawlCommand weatherCmd) { |
||||
|
this.weatherCmd = weatherCmd; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() { |
||||
|
DataUtil.analyzeWeatherOnly(weatherCmd.getResult()); |
||||
|
logger.info("天气数据分析命令执行完成(仅统计)"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,54 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerContext; |
||||
|
import crawler.WeatherCrawler; |
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Weather; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import util.DataUtil; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class WeatherCrawlCommand implements CrawlCommand { |
||||
|
private final CrawlerContext context; |
||||
|
private List<Weather> weatherList; |
||||
|
private List<Weather> oldWeatherList; |
||||
|
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(WeatherCrawlCommand.class); |
||||
|
|
||||
|
public WeatherCrawlCommand(CrawlerContext context) { |
||||
|
this.context = context; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws ParseException, NetworkException, CrawlerException, IOException { |
||||
|
oldWeatherList = DataUtil.importJson("weather.json", Weather.class); |
||||
|
logger.info("导入历史天气数据:{}条", oldWeatherList.size()); |
||||
|
|
||||
|
// 第四点:增量抓取,过滤重复城市
|
||||
|
Set<String> existCities = DataUtil.getExistWeatherCities("weather.json"); |
||||
|
|
||||
|
context.setCrawlerStrategy(new WeatherCrawler()); |
||||
|
weatherList = (List<Weather>) context.executeCrawl(); |
||||
|
|
||||
|
weatherList = weatherList.stream() |
||||
|
.filter(weather -> !existCities.contains(weather.getCity())) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
weatherList.addAll(oldWeatherList); |
||||
|
|
||||
|
DataUtil.addAll("天气数据.txt", weatherList); |
||||
|
DataUtil.exportJson("weather.json", weatherList); |
||||
|
logger.info("天气爬取完成,本次新增:{}条", weatherList.size() - oldWeatherList.size()); |
||||
|
} |
||||
|
|
||||
|
public List<Weather> getResult() { |
||||
|
return weatherList; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package controller; |
||||
|
|
||||
|
import crawler.Crawler; |
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
// ===================== 6. 策略上下文 =====================
|
||||
|
public class CrawlerContext { |
||||
|
private Crawler crawlerStrategy; |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerContext.class); |
||||
|
|
||||
|
public void setCrawlerStrategy(Crawler crawlerStrategy) { |
||||
|
this.crawlerStrategy = crawlerStrategy; |
||||
|
} |
||||
|
|
||||
|
public List<?> executeCrawl() throws ParseException, NetworkException, CrawlerException { |
||||
|
if (crawlerStrategy == null) { |
||||
|
logger.error("未设置爬取策略"); |
||||
|
throw new CrawlerException("爬取策略未配置"); |
||||
|
} |
||||
|
return crawlerStrategy.startCrawl(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,64 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import org.jsoup.Connection; |
||||
|
import org.jsoup.HttpStatusException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class BaseCrawler<T> implements Crawler { |
||||
|
protected final String baseUrl; |
||||
|
private static final Logger logger = LoggerFactory.getLogger(BaseCrawler.class); |
||||
|
|
||||
|
public BaseCrawler(String baseUrl) { |
||||
|
this.baseUrl = baseUrl; |
||||
|
} |
||||
|
|
||||
|
// 定义和子类匹配的抽象方法签名
|
||||
|
public abstract List<T> startCrawl() throws ParseException, NetworkException, CrawlerException; |
||||
|
|
||||
|
// 统一请求页面方法(加固防拦截)
|
||||
|
public Document getPage(String url) throws NetworkException { |
||||
|
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"; |
||||
|
String cookie = "ptui_loginuin=; pgv_pvid=123456; RK=randomtest; _qpsvr_localtest=; uin=;"; |
||||
|
|
||||
|
int retry = 3; |
||||
|
while (retry > 0) { |
||||
|
try { |
||||
|
Connection conn = Jsoup.connect(url) |
||||
|
.userAgent(userAgent) |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9") |
||||
|
.header("Referer", url.contains("douban") ? "https://movie.douban.com/" : "https://pvp.qq.com/") |
||||
|
.header("Connection", "keep-alive") |
||||
|
.cookie("Cookie", cookie) |
||||
|
.timeout(15000) |
||||
|
.followRedirects(true); |
||||
|
|
||||
|
Document doc = conn.get(); |
||||
|
logger.info("第{}次请求页面:{}", 4 - retry, url); |
||||
|
return doc; |
||||
|
} catch (HttpStatusException e) { |
||||
|
retry--; |
||||
|
logger.error("请求页面失败,剩余重试次数:{}", retry, e); |
||||
|
if (retry <= 0) { |
||||
|
throw new NetworkException("页面请求彻底失败:" + url, e); |
||||
|
} |
||||
|
try { |
||||
|
Thread.sleep(2000); |
||||
|
} catch (InterruptedException ie) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
throw new NetworkException("页面请求异常:" + url, e); |
||||
|
} |
||||
|
} |
||||
|
throw new NetworkException("请求超时:" + url); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,12 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
// ===================== 2. 策略模式:抽象策略接口 =====================
|
||||
|
public interface Crawler { |
||||
|
List<?> startCrawl() throws ParseException, NetworkException, CrawlerException; |
||||
|
} |
||||
@ -0,0 +1,43 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Hero; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HeroCrawler extends BaseCrawler { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(HeroCrawler.class); |
||||
|
|
||||
|
public HeroCrawler() { |
||||
|
super("https://pvp.qq.com/web201605/herolist.shtml"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Hero> startCrawl() throws ParseException, NetworkException, CrawlerException { |
||||
|
List<Hero> list = new ArrayList<>(); |
||||
|
logger.info("开始爬取王者荣耀英雄数据"); |
||||
|
try { |
||||
|
Document doc = getPage(baseUrl); |
||||
|
Elements heros = doc.select("ul.herolist li a"); |
||||
|
if (heros.isEmpty()) throw new ParseException("页面解析失败:未找到英雄列表项"); |
||||
|
for (Element h : heros) { |
||||
|
String name = h.text().trim(); |
||||
|
if (!name.isEmpty()) list.add(new Hero(name)); |
||||
|
} |
||||
|
logger.info("英雄爬取完成,共{}条数据", list.size()); |
||||
|
} catch (NetworkException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("英雄数据解析异常", e); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,56 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Movie; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
// ===================== 5. 具体策略爬虫类 =====================
|
||||
|
public class MovieCrawler extends BaseCrawler { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(MovieCrawler.class); |
||||
|
|
||||
|
public MovieCrawler() { |
||||
|
super("https://movie.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> startCrawl() throws ParseException, NetworkException, CrawlerException { |
||||
|
List<Movie> list = new ArrayList<>(); |
||||
|
logger.info("开始爬取豆瓣电影Top250"); |
||||
|
try { |
||||
|
for (int i = 0; i < 250; i += 25) { |
||||
|
Document doc = getPage(baseUrl + "?start=" + i); |
||||
|
Elements items = doc.select(".item"); |
||||
|
if (items.isEmpty()) throw new ParseException("页面解析失败:未找到电影列表项"); |
||||
|
for (Element e : items) { |
||||
|
Element titleEle = e.select(".title").first(); |
||||
|
Element ratingEle = e.select(".rating_num").first(); |
||||
|
if (titleEle == null || ratingEle == null) { |
||||
|
logger.warn("单条电影数据解析失败,跳过"); |
||||
|
continue; |
||||
|
} |
||||
|
String title = titleEle.text().split("/")[0].trim(); |
||||
|
String rating = ratingEle.text(); |
||||
|
list.add(new Movie(title, rating)); |
||||
|
} |
||||
|
Thread.sleep(1000); |
||||
|
} |
||||
|
logger.info("豆瓣电影爬取完成,共{}条数据", list.size()); |
||||
|
} catch (NetworkException e) { |
||||
|
throw e; |
||||
|
} catch (InterruptedException e) { |
||||
|
throw new CrawlerException("爬取被中断", e); |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("电影数据解析异常", e); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,74 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import exception.CrawlerException; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Weather; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class WeatherCrawler extends BaseCrawler { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(WeatherCrawler.class); |
||||
|
private static final String[][] cities = { |
||||
|
{"北京", "北京", "101010100"}, {"上海", "上海", "101020100"}, {"天津", "天津", "101030100"}, {"重庆", "重庆", "101040100"}, |
||||
|
{"河北", "石家庄", "101090101"}, {"山西", "太原", "101100101"}, {"辽宁", "沈阳", "101070101"}, {"吉林", "长春", "101060101"}, |
||||
|
{"黑龙江", "哈尔滨", "101050101"}, {"江苏", "南京", "101190101"}, {"浙江", "杭州", "101210101"}, {"安徽", "合肥", "101220101"}, |
||||
|
{"福建", "福州", "101230101"}, {"江西", "南昌", "101240101"}, {"山东", "济南", "101120101"}, {"河南", "郑州", "101180101"}, |
||||
|
{"湖北", "武汉", "101200101"}, {"湖南", "长沙", "101250101"}, {"广东", "广州", "101280101"}, {"海南", "海口", "101310101"}, |
||||
|
{"四川", "成都", "101270101"}, {"贵州", "贵阳", "101260101"}, {"云南", "昆明", "101290101"}, {"陕西", "西安", "101110101"}, |
||||
|
{"甘肃", "兰州", "101160101"}, {"青海", "西宁", "101150101"}, {"内蒙古", "呼和浩特", "101080101"}, {"广西", "南宁", "101300101"}, |
||||
|
{"西藏", "拉萨", "101140101"}, {"宁夏", "银川", "101170101"}, {"新疆", "乌鲁木齐", "101130101"}, |
||||
|
{"香港", "香港", "101320101"}, {"澳门", "澳门", "101330101"}, {"台湾", "台北", "101340101"} |
||||
|
}; |
||||
|
|
||||
|
public WeatherCrawler() { |
||||
|
super("https://www.weather.com.cn/weather/"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Weather> startCrawl() throws ParseException, NetworkException, CrawlerException { |
||||
|
List<Weather> list = new ArrayList<>(); |
||||
|
logger.info("开始爬取全国城市实时温度数据"); |
||||
|
try { |
||||
|
for (String[] city : cities) { |
||||
|
String province = city[0]; |
||||
|
String cityName = city[1]; |
||||
|
String code = city[2]; |
||||
|
Document doc = getPage(baseUrl + code + ".shtml"); |
||||
|
|
||||
|
// 取7天预报【第1个li】=今日实时温度,全页面通用、绝不空指针
|
||||
|
Element today = doc.select("ul.t li").first(); |
||||
|
if (today == null) throw new ParseException("实时天气解析失败:" + cityName); |
||||
|
|
||||
|
String tempStr = today.select(".tem").text(); |
||||
|
String weaStr = today.select(".wea").text(); |
||||
|
|
||||
|
// 拆分:最高温/最低温,取**最高温作为实时温度**
|
||||
|
String realTemp; |
||||
|
if(tempStr.contains("/")){ |
||||
|
realTemp = tempStr.split("/")[0]; |
||||
|
}else if(tempStr.contains("~")){ |
||||
|
realTemp = tempStr.split("~")[0]; |
||||
|
}else{ |
||||
|
realTemp = tempStr; |
||||
|
} |
||||
|
|
||||
|
list.add(new Weather(province, cityName, weaStr, realTemp)); |
||||
|
Thread.sleep(500); |
||||
|
} |
||||
|
logger.info("实时天气爬取完成,共{}条数据", list.size()); |
||||
|
} catch (NetworkException e) { |
||||
|
throw e; |
||||
|
} catch (InterruptedException e) { |
||||
|
throw new CrawlerException("爬取线程被中断", e); |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("天气数据解析异常", e); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,10 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
/**
 * Base checked exception of the crawler; the more specific crawler exceptions extend it.
 */
public class CrawlerException extends Exception {

    /**
     * Creates an exception with a detail message and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   the throwable that triggered this failure
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception with a detail message and no cause.
     *
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }
}
||||
@ -0,0 +1,10 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,10 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,13 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration> |
||||
|
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
||||
|
<encoder> |
||||
|
<!-- Console log layout: time, log level, logger (class) name, message --> |
||||
|
<pattern>%d{HH:mm:ss.SSS} [%level] %logger - %msg%n</pattern> |
||||
|
<charset>UTF-8</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
<root level="INFO"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
</root> |
||||
|
</configuration> |
||||
@ -0,0 +1,18 @@ |
|||||
|
package model; |
||||
|
|
||||
|
/**
 * Immutable model object holding the name of a single hero.
 */
public class Hero {
    private final String name;

    /**
     * @param name the hero's name
     */
    public Hero(String name) {
        this.name = name;
    }

    /**
     * @return the hero's name as passed to the constructor
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return a display line consisting of a fixed label followed by the name
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("英雄:");
        text.append(this.name);
        return text.toString();
    }
}
||||
@ -0,0 +1,33 @@ |
|||||
|
package model; |
||||
|
|
||||
|
// ===================== 4. MVC‑Model:实体类 =====================
|
||||
|
/**
 * Immutable model object for one movie: its title and its rating as raw text.
 */
public class Movie {
    private final String title;
    private final String rating;

    /**
     * @param title  the movie title
     * @param rating the rating as text, e.g. {@code "9.7"}
     */
    public Movie(String title, String rating) {
        this.title = title;
        this.rating = rating;
    }

    /**
     * @return the movie title
     */
    public String getTitle() {
        return title;
    }

    /**
     * Parses the rating text as a {@code double}.
     *
     * @return the numeric rating
     * @throws IllegalArgumentException if the rating is {@code null} or not a valid number.
     *         A missing rating is reported the same way as a malformed one, so callers
     *         only need to handle this single exception type.
     */
    public double getRatingDouble() {
        if (rating == null) {
            // Double.parseDouble(null) would throw NullPointerException, bypassing the
            // IllegalArgumentException contract of this method.
            throw new IllegalArgumentException("评分格式错误:" + rating);
        }
        try {
            return Double.parseDouble(rating);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("评分格式错误:" + rating, e);
        }
    }

    /**
     * @return the rating exactly as passed to the constructor
     */
    public String getRating() {
        return rating;
    }

    /**
     * @return a display line containing the title and the raw rating text
     */
    @Override
    public String toString() {
        return "电影:《" + title + "》 | 评分:" + rating;
    }
}
||||
@ -0,0 +1,34 @@ |
|||||
|
package model; |
||||
|
|
||||
|
/**
 * Immutable model object for one city's weather entry.
 */
public class Weather {
    /** Matches the first optionally negative integer in a temperature text. */
    private static final java.util.regex.Pattern FIRST_INTEGER =
            java.util.regex.Pattern.compile("-?\\d+");

    private final String province;
    private final String city;
    private final String condition;
    private final String temperature; // temperature as text, e.g. "25℃"

    /**
     * @param province    province the city belongs to
     * @param city        city name
     * @param condition   weather condition text
     * @param temperature temperature as text
     */
    public Weather(String province, String city, String condition, String temperature) {
        this.province = province;
        this.city = city;
        this.condition = condition;
        this.temperature = temperature;
    }

    public String getProvince() {
        return province;
    }

    public String getCity() {
        return city;
    }

    public String getCondition() {
        return condition;
    }

    public String getTemperature() {
        return temperature;
    }

    /**
     * Extracts the temperature as a number, for sorting and statistics.
     *
     * <p>The first integer found in the text is used, including a leading minus sign, so
     * {@code "-5℃"} yields {@code -5} and a range such as {@code "12℃/3℃"} yields
     * {@code 12}. (Stripping every non-digit character would lose the sign and glue the
     * digits of a range together.) A Unicode minus sign (U+2212) is accepted as well.
     *
     * @return the parsed temperature, or {@link Integer#MIN_VALUE} when the text is
     *         {@code null}, contains no integer, or the integer does not fit in an {@code int}
     */
    public int getTempNum() {
        if (temperature == null) {
            return Integer.MIN_VALUE;
        }
        java.util.regex.Matcher matcher = FIRST_INTEGER.matcher(temperature.replace('\u2212', '-'));
        if (!matcher.find()) {
            return Integer.MIN_VALUE;
        }
        try {
            return Integer.parseInt(matcher.group());
        } catch (NumberFormatException e) {
            // Only reachable when the digit run overflows int.
            return Integer.MIN_VALUE;
        }
    }

    /**
     * @return a display line containing province, city, condition and temperature text
     */
    @Override
    public String toString() {
        return "省份:" + province + " | 城市:" + city + " | 天气:" + condition + " | 实时温度:" + temperature;
    }
}
||||
@ -0,0 +1,195 @@ |
|||||
|
package util; |
||||
|
|
||||
|
import com.alibaba.fastjson2.JSON; |
||||
|
import com.alibaba.fastjson2.JSONWriter; |
||||
|
import exception.ParseException; |
||||
|
import model.Hero; |
||||
|
import model.Movie; |
||||
|
import model.Weather; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.FileReader; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
import java.util.stream.Collectors; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.Set; |
||||
|
|
||||
|
// ===================== 7. MVC‑Repository:数据仓库 =====================
|
||||
|
public final class DataUtil { |
||||
|
private static final String PATH = "D:\\Java爬虫\\"; |
||||
|
private static final Logger logger = LoggerFactory.getLogger(DataUtil.class); |
||||
|
|
||||
|
private DataUtil() { |
||||
|
} |
||||
|
|
||||
|
public static void initFolder() { |
||||
|
File dir = new File(PATH); |
||||
|
if (!dir.exists()) { |
||||
|
boolean created = dir.mkdirs(); |
||||
|
if (created) logger.info("创建目录:{}", PATH); |
||||
|
else logger.error("目录创建失败:{}", PATH); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// ✅ 第一点:try‑with‑resources 安全资源管理
|
||||
|
public static void saveText(String fileName, String content) throws IOException { |
||||
|
if (fileName == null || fileName.isBlank()) throw new IllegalArgumentException("文件名不能为空"); |
||||
|
if (content == null || content.isBlank()) { |
||||
|
logger.warn("保存文件内容为空,跳过:{}", fileName); |
||||
|
return; |
||||
|
} |
||||
|
try (FileWriter fw = new FileWriter(PATH + fileName)) { |
||||
|
fw.write(content); |
||||
|
} |
||||
|
logger.info("文件保存成功:{}", fileName); |
||||
|
} |
||||
|
|
||||
|
public static <T> void addAll(String fileName, List<T> dataList) throws IOException { |
||||
|
if (dataList == null) throw new NullPointerException("待保存数据列表不能为null"); |
||||
|
if (dataList.isEmpty()) { |
||||
|
logger.warn("批量数据为空,跳过保存:{}", fileName); |
||||
|
return; |
||||
|
} |
||||
|
StringBuilder sb = new StringBuilder(); |
||||
|
dataList.forEach(item -> { |
||||
|
if (item != null) sb.append(item).append("\r\n"); |
||||
|
}); |
||||
|
saveText(fileName, sb.toString()); |
||||
|
} |
||||
|
|
||||
|
// ✅ 第二点:JSON 序列化持久化导出
|
||||
|
public static <T> void exportJson(String fileName, List<T> dataList) throws IOException { |
||||
|
if (dataList == null || dataList.isEmpty()) { |
||||
|
logger.warn("JSON导出:数据为空,跳过"); |
||||
|
return; |
||||
|
} |
||||
|
try (FileWriter fw = new FileWriter(PATH + fileName)) { |
||||
|
String jsonStr = JSON.toJSONString(dataList, JSONWriter.Feature.PrettyFormat); |
||||
|
fw.write(jsonStr); |
||||
|
} |
||||
|
logger.info("✅ JSON文件导出成功:{}", fileName); |
||||
|
} |
||||
|
|
||||
|
// ✅ 第三点:数据导入恢复会话(已加入解析异常捕获)
|
||||
|
public static <T> List<T> importJson(String fileName, Class<T> clazz) throws IOException, ParseException { |
||||
|
File file = new File(PATH + fileName); |
||||
|
if (!file.exists()) { |
||||
|
logger.warn("导入文件不存在:{}", fileName); |
||||
|
return Collections.emptyList(); |
||||
|
} |
||||
|
try (FileReader fr = new FileReader(file)) { |
||||
|
char[] buf = new char[(int) file.length()]; |
||||
|
fr.read(buf); |
||||
|
String jsonStr = new String(buf); |
||||
|
try { |
||||
|
return JSON.parseArray(jsonStr, clazz); |
||||
|
} catch (com.alibaba.fastjson2.JSONException e) { |
||||
|
logger.error("JSON格式解析错误:{}", e.getMessage()); |
||||
|
throw new ParseException("数据格式错误,解析失败:" + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// ✅ 第四点:增量抓取 - 获取已爬取电影标题集合(去重用)
|
||||
|
public static Set<String> getExistMovieTitles(String fileName) { |
||||
|
try { |
||||
|
List<Movie> oldList = importJson(fileName, Movie.class); |
||||
|
return oldList.stream().map(Movie::getTitle).collect(Collectors.toSet()); |
||||
|
} catch (IOException | ParseException e) { |
||||
|
logger.warn("读取历史电影数据失败,将全量抓取", e); |
||||
|
return new HashSet<>(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// ✅ 第四点:增量抓取 - 获取已爬取英雄名称集合(去重用)
|
||||
|
public static Set<String> getExistHeroNames(String fileName) { |
||||
|
try { |
||||
|
List<Hero> oldList = importJson(fileName, Hero.class); |
||||
|
return oldList.stream().map(Hero::getName).collect(Collectors.toSet()); |
||||
|
} catch (IOException | ParseException e) { |
||||
|
logger.warn("读取历史英雄数据失败,将全量抓取", e); |
||||
|
return new HashSet<>(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// ✅ 第四点:增量抓取 - 获取已爬取城市名集合(天气去重用)
|
||||
|
public static Set<String> getExistWeatherCities(String fileName) { |
||||
|
try { |
||||
|
List<Weather> oldList = importJson(fileName, Weather.class); |
||||
|
return oldList.stream().map(Weather::getCity).collect(Collectors.toSet()); |
||||
|
} catch (IOException | ParseException e) { |
||||
|
logger.warn("读取历史天气数据失败,将全量抓取", e); |
||||
|
return new HashSet<>(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void analyzeOnly(List<Movie> movieList, List<Hero> heroList) { |
||||
|
if (movieList == null || heroList == null) return; |
||||
|
logger.info("===== 电影&英雄数据分析(仅统计,不存储) ====="); |
||||
|
double sum = 0; |
||||
|
int validCount = 0; |
||||
|
for (Movie movie : movieList) { |
||||
|
try { |
||||
|
sum += movie.getRatingDouble(); |
||||
|
validCount++; |
||||
|
} catch (IllegalArgumentException e) { |
||||
|
logger.warn("电影评分解析失败,跳过:{}", movie.getTitle(), e); |
||||
|
} |
||||
|
} |
||||
|
if (validCount == 0) { |
||||
|
logger.error("无有效电影评分数据"); |
||||
|
return; |
||||
|
} |
||||
|
double avg = sum / validCount; |
||||
|
System.out.println("电影平均评分:" + String.format("%.2f", avg)); |
||||
|
long highScoreCount = movieList.stream() |
||||
|
.filter(m -> { |
||||
|
try { |
||||
|
return m.getRatingDouble() >= 8.5; |
||||
|
} catch (IllegalArgumentException e) { |
||||
|
return false; |
||||
|
} |
||||
|
}) |
||||
|
.count(); |
||||
|
System.out.println("8.5分以上电影数量:" + highScoreCount); |
||||
|
System.out.println("英雄总数量:" + heroList.size()); |
||||
|
logger.info("电影&英雄数据分析结束"); |
||||
|
} |
||||
|
|
||||
|
public static void analyzeWeatherOnly(List<Weather> weatherList) { |
||||
|
if (weatherList == null || weatherList.isEmpty()) { |
||||
|
logger.warn("天气数据为空,无法统计"); |
||||
|
return; |
||||
|
} |
||||
|
logger.info("===== 全国天气数据分析(仅统计,不存储) ====="); |
||||
|
|
||||
|
Map<String, Long> weatherTypeCount = weatherList.stream() |
||||
|
.collect(Collectors.groupingBy(Weather::getCondition, Collectors.counting())); |
||||
|
System.out.println("\n各天气类型数量:"); |
||||
|
weatherTypeCount.forEach((type, count) -> System.out.println(" " + type + ":" + count + "个")); |
||||
|
|
||||
|
List<Integer> temps = weatherList.stream() |
||||
|
.map(Weather::getTempNum) |
||||
|
.filter(t -> t != Integer.MIN_VALUE) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
if (!temps.isEmpty()) { |
||||
|
int maxTemp = Collections.max(temps); |
||||
|
int minTemp = Collections.min(temps); |
||||
|
double avgTemp = temps.stream().mapToInt(Integer::intValue).average().orElse(0); |
||||
|
System.out.println("\n温度统计(最高温):"); |
||||
|
System.out.println(" 最高温度:" + maxTemp + "℃"); |
||||
|
System.out.println(" 最低温度:" + minTemp + "℃"); |
||||
|
System.out.println(" 平均温度:" + String.format("%.1f", avgTemp) + "℃"); |
||||
|
} else { |
||||
|
System.out.println("无有效温度数据"); |
||||
|
} |
||||
|
logger.info("天气数据分析结束"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
package view; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
// ===================== 9. MVC‑View:视图层 =====================
|
||||
|
public class CrawlerView { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerView.class); |
||||
|
private final Scanner scanner = new Scanner(System.in); |
||||
|
|
||||
|
public void showMenu() { |
||||
|
System.out.println("\n===== 爬虫CLI交互菜单 ====="); |
||||
|
System.out.println("1. 爬取豆瓣电影"); |
||||
|
System.out.println("2. 爬取王者荣耀英雄"); |
||||
|
System.out.println("3. 爬取全国天气"); |
||||
|
System.out.println("4. 电影&英雄数据分析(仅统计,不存储)"); |
||||
|
System.out.println("5. 天气数据分析(天气类型、最高/最低/平均温)"); |
||||
|
System.out.println("6. 导入历史数据"); |
||||
|
System.out.println("0. 退出程序"); |
||||
|
System.out.print("请输入操作指令:"); |
||||
|
} |
||||
|
|
||||
|
public int getInput() { |
||||
|
return Integer.parseInt(scanner.nextLine()); |
||||
|
} |
||||
|
|
||||
|
public void showMsg(String msg) { |
||||
|
System.out.println(msg); |
||||
|
logger.info(msg); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue