diff --git a/project(期末项目报告)/202401070210-郑诗艺-期末实验报告.pdf b/project(期末项目报告)/202401070210-郑诗艺-期末实验报告.pdf new file mode 100644 index 0000000..eef9320 Binary files /dev/null and b/project(期末项目报告)/202401070210-郑诗艺-期末实验报告.pdf differ diff --git a/project(期末项目报告)/CrawlerMain2/.gitignore b/project(期末项目报告)/CrawlerMain2/.gitignore new file mode 100644 index 0000000..f68d109 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.gitignore @@ -0,0 +1,29 @@ +### IntelliJ IDEA ### +out/ +!**/src/main/**/out/ +!**/src/test/**/out/ + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache +bin/ +!**/src/main/**/bin/ +!**/src/test/**/bin/ + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ + +### VS Code ### +.vscode/ + +### Mac OS ### +.DS_Store \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/.gitignore b/project(期末项目报告)/CrawlerMain2/.idea/.gitignore new file mode 100644 index 0000000..7d05e99 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/.gitignore @@ -0,0 +1,10 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# 依赖于环境的 Maven 主目录路径 +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/fastjson2_2_0_32.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/fastjson2_2_0_32.xml new file mode 100644 index 0000000..5565c61 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/fastjson2_2_0_32.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml new file mode 100644 index 0000000..cef0a8d --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git 
a/project(期末项目报告)/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml new file mode 100644 index 0000000..6fdf9d7 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml new file mode 100644 index 0000000..90ce41d --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml new file mode 100644 index 0000000..c74069d --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml new file mode 100644 index 0000000..54a73cf --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml new file mode 100644 index 0000000..fbdb3a1 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml b/project(期末项目报告)/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml new file mode 100644 index 0000000..7c49634 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml @@ -0,0 +1,9 @@ + + + + + + + + 
+ \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/misc.xml b/project(期末项目报告)/CrawlerMain2/.idea/misc.xml new file mode 100644 index 0000000..3653b1f --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/.idea/modules.xml b/project(期末项目报告)/CrawlerMain2/.idea/modules.xml new file mode 100644 index 0000000..8824534 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/CrawlerMain2.iml b/project(期末项目报告)/CrawlerMain2/CrawlerMain2.iml new file mode 100644 index 0000000..b3ea8c0 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/CrawlerMain2.iml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/README.md b/project(期末项目报告)/CrawlerMain2/README.md new file mode 100644 index 0000000..2d7bcc1 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/README.md @@ -0,0 +1,150 @@ +# Java爬虫综合项目(CLI + MVC + Command + 策略模式) +## 一、项目简介 +本项目实现了一个命令行菜单式爬虫,支持爬取豆瓣电影、王者荣耀英雄、中国天气网天气等数据。采用MVC、命令模式、策略模式、多层结构,集成日志体系与异常分包,代码结构规范,易于扩展和维护。 + +## 二、项目结构 +```` +CrawlerMain2 +├── .idea # IDEA 配置文件 +├── out # 编译输出目录 +└── src +├── command # 命令模式模块 +│ ├── AnalyzeCommand +│ ├── CommandInvoker +│ ├── CrawlCommand +│ ├── HeroCrawlCommand +│ ├── MovieCrawlCommand +│ ├── WeatherAnalyzeCommand +│ └── WeatherCrawlCommand +├── controller # 控制器与上下文 +│ └── CrawlerContext +├── crawler # 策略模式爬虫实现 +│ ├── BaseCrawler +│ ├── Crawler +│ ├── HeroCrawler +│ ├── MovieCrawler +│ └── WeatherCrawler +├── exception # 自定义异常体系 +├── model # 数据实体类 +│ ├── Hero +│ ├── Movie +│ └── Weather +├── util # 工具类 +│ └── DataUtil # JSON 导入导出、增量去重、文件 IO +├── view # 视图层(CLI 菜单交互) +│ └── CrawlerView +├── CrawlerMain # 程序入口 +└── logback.xml # 日志配置 +├── .gitignore # Git忽略文件配置 +└── CrawlerMain2.iml # IDEA模块配置 +```` +--- +## 三、功能介绍 + +- 
命令行菜单,操作简单,支持多种数据源抓取 +- 豆瓣电影、王者荣耀英雄、中国天气网天气数据采集 +- 本地数据存储和分析统计 +- MVC分层、命令模式、策略模式设计,扩展方便 +- try-with-resources安全IO,保证资源释放与数据安全 +- 支持JSON序列化导出(movie.json等)与数据备份 +- 历史数据导入恢复功能,支持断点/回溯操作 +- 增量抓取机制,自动去重,避免重复采集 +- 日志体系与异常处理,项目健壮可追踪 + +--- + +## 四、依赖说明 + +- **JDK 8及以上版本** +- **jsoup**(网页解析) +- **slf4j**(日志接口) +- **logback**(日志实现) +- **fastjson2**(JSON处理框架,完成数据序列化/反序列化) +- IO相关类(java.io.File、FileReader、IOException等)实现文件操作 +- 集合类(Set、HashSet、Stream),用于增量抓取与数据去重 + +> 推荐使用 Maven(pom.xml)管理依赖,添加如下: + +```xml +<dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.15.3</version> +</dependency> +<dependency> +<groupId>org.slf4j</groupId> +<artifactId>slf4j-api</artifactId> +<version>1.7.36</version> +</dependency> +<dependency> +<groupId>ch.qos.logback</groupId> +<artifactId>logback-classic</artifactId> +<version>1.2.11</version> +</dependency> +<dependency> +<groupId>com.alibaba.fastjson2</groupId> +<artifactId>fastjson2</artifactId> +<version>2.0.47</version> +</dependency> +``` +如果不用 Maven,可手动下载相关 jar 包,并在 IDEA 的 Libraries 中添加。 + +--- +## 五、运行说明(IntelliJ IDEA) +1. 用 IDEA 打开项目根目录(包含 src/,logback.xml 等)。 +2. 配置 jsoup、slf4j、logback、fastjson2 依赖(建议用 Maven),或手动添加。 +3. 确认DataUtil.java里的数据存储路径有权限。如必要,修改为本地可用的目录。 +4. 右键CrawlerMain.java,选择“Run”,即可启动项目。 +5. 启动后按照命令行菜单提示输入数字操作,体验各类爬取、分析、导出、恢复功能: +```` + 1 爬取豆瓣电影 + 2 爬取王者荣耀英雄 + 3 爬取全国天气 + 4 电影、英雄数据分析(仅统计,不存储) + 5 天气数据分析 + 6 导入历史数据 + 0 退出程序 +```` +程序在抓取时自动生成 JSON 文件 (movie.json),支持导入历史数据并增量去重。 + +--- +## 六、主要设计与包说明 +| 包/类 | 功能描述 | +| ---------------- | ------------------------------------------ | +| exception | 自定义异常体系 | +| command | 命令模式相关类(命令封装、调度) | +| controller | 控制器上下文(业务流转管理) | +| crawler | 策略接口与具体爬虫实现 | +| model | 数据实体类(电影、英雄、天气) | +| util/DataUtil.java | 数据存储、历史恢复、JSON导入导出与增量去重 | +| view | CLI菜单视图,交互展示 | +--- +## 七、核心工具/新功能说明 +- DataUtil 工具类 + - 新增方法封装 JSON 序列化与反序列化(依赖 fastjson2 的 JSON 工具类) + - 支持文件安全读写(采用 try‑with‑resources 结构,自动释放 IO 资源) + - 增量抓取与去重,采用 Set/HashSet 结构自动过滤已采集的对象 + - 历史数据校验与导入,支持断点恢复(即重新导入 movie.json,恢复进度) + +--- +## 八、常见问题 +1. 存储权限报错:请修改 DataUtil 的数据路径为本机有权限的盘符或目录 +2. 依赖缺失或报错:确保 jsoup、slf4j、logback、fastjson2 已配置到项目 +3. JSON导入导出异常:检查 movie.json 是否存在且格式正确,或依赖版本是否兼容 +4. 重复抓取问题:增量去重会自动过滤相同对象,手动修改数据时注意唯一性 +5. 
出现 IOException:请检查文件路径、权限是否正常 + +--- +## 九、扩展与二次开发说明 +- 新增网站爬虫或数据实体: + - 新建 strategy 类(如 NewSiteCrawler.java) + - 新建 command 类,添加进菜单与调度 + - 扩展 model 数据实体 +- 导出和导入功能可支持更多格式,只需扩展 DataUtil 工具类即可 + +--- +## 十、作者信息 +- 姓名:郑诗艺 +- 学号:202401070210 +- 班级:大数据管理与应用2402班 +- 日期:2026.5.24 diff --git a/project(期末项目报告)/CrawlerMain2/src/ClassDiagram.puml b/project(期末项目报告)/CrawlerMain2/src/ClassDiagram.puml new file mode 100644 index 0000000..40f7cca --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/ClassDiagram.puml @@ -0,0 +1,100 @@ +@startuml 爬虫项目类图_完整竖版一张图 +' 全局设置:垂直布局、紧凑、单张大图、禁止分页 +skinparam layout topdown +skinparam ranksep 55 +skinparam nodesep 18 +skinparam dpi 300 +skinparam page { + width 100% + height 100% +} +skinparam class { + BackgroundColor #f0f5ff + BorderColor #222222 + ArrowColor #222222 +} +skinparam package { + BackgroundColor #eef4ff + BorderColor #222222 +} + +' 从上到下依次排列所有包,不拆分 +package model { + class Hero { + -name: String + -type: String + } + class Movie { + -title: String + -rate: Double + } + class Weather { + -city: String + -temp: String + } +} + +package crawler { + interface Crawler { + +crawl(): List + } + abstract class BaseCrawler implements Crawler { + +crawl(): List + } + class MovieCrawler extends BaseCrawler + class HeroCrawler extends BaseCrawler + class WeatherCrawler extends BaseCrawler +} + +package exception { + class CrawlerException { + +CrawlerException(msg: String) + } + class NetworkException extends CrawlerException + class ParseException extends CrawlerException +} + +package command { + interface CrawlCommand { + +execute(): void + } + class CommandInvoker { + +invoke(): void + } + class MovieCrawlCommand implements CrawlCommand + class HeroCrawlCommand implements CrawlCommand + class WeatherCrawlCommand implements CrawlCommand + class AnalyzeCommand implements CrawlCommand + class WeatherAnalyzeCommand implements CrawlCommand + CommandInvoker -[dashed]-> CrawlCommand : 调度 +} + +package controller { + class CrawlerContext +} + 
+package view { + class CrawlerView { + +showMenu(): void + } +} + +package util { + class DataUtil { + +exportJson(): void + +importJson(): List + +removeDuplicate(): List + +analyzeOnly(): void + +analyzeWeatherOnly(): void + } +} + +class CrawlerMain + +' 依赖关系 +CrawlerMain --> CrawlerContext +CrawlerMain --> CrawlerView +CrawlerMain --> DataUtil +DataUtil -[dashed]-> ParseException : throws + +@enduml \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/CrawlerMain.java b/project(期末项目报告)/CrawlerMain2/src/CrawlerMain.java new file mode 100644 index 0000000..9e159f9 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/CrawlerMain.java @@ -0,0 +1,101 @@ +import command.AnalyzeCommand; +import command.HeroCrawlCommand; +import command.MovieCrawlCommand; +import command.WeatherAnalyzeCommand; +import command.WeatherCrawlCommand; +import command.CommandInvoker; +import controller.CrawlerContext; +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import model.Hero; +import model.Movie; +import model.Weather; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import util.DataUtil; +import view.CrawlerView; + +import java.io.IOException; +import java.util.List; + +// ===================== 10. 
主程序入口 ===================== +public class CrawlerMain { + private static final Logger logger = LoggerFactory.getLogger(CrawlerMain.class); + + public static void main(String[] args) { + logger.info("===== 爬虫程序启动(CLI+MVC+Command+策略模式) ====="); + CrawlerView view = new CrawlerView(); + CrawlerContext context = new CrawlerContext(); + CommandInvoker invoker = new CommandInvoker(); + + MovieCrawlCommand movieCmd = new MovieCrawlCommand(context); + HeroCrawlCommand heroCmd = new HeroCrawlCommand(context); + WeatherCrawlCommand weatherCmd = new WeatherCrawlCommand(context); + AnalyzeCommand analyzeCmd = new AnalyzeCommand(movieCmd, heroCmd); + WeatherAnalyzeCommand weatherAnalyzeCmd = new WeatherAnalyzeCommand(weatherCmd); + + DataUtil.initFolder(); + + while (true) { + try { + view.showMenu(); + int op = view.getInput(); + switch (op) { + case 1: + invoker.runCommand(movieCmd); + view.showMsg("电影爬取完成"); + break; + case 2: + invoker.runCommand(heroCmd); + view.showMsg("英雄爬取完成"); + break; + case 3: + invoker.runCommand(weatherCmd); + view.showMsg("天气爬取完成"); + break; + case 4: + analyzeCmd.execute(); + break; + case 5: + weatherAnalyzeCmd.execute(); + break; + // 第6项:导入历史数据(无fileName:,无报错) + case 6: + try { + List movieList = DataUtil.importJson("movie.json", Movie.class); + List heroList = DataUtil.importJson("hero.json", Hero.class); + List weatherList = DataUtil.importJson("weather.json", Weather.class); + view.showMsg("✅ 历史数据导入成功!"); + view.showMsg("电影:" + movieList.size() + " 条"); + view.showMsg("英雄:" + heroList.size() + " 条"); + view.showMsg("天气:" + weatherList.size() + " 条"); + } catch (IOException e) { + view.showMsg("导入失败:" + e.getMessage()); + } + break; + case 0: + view.showMsg("程序退出"); + System.exit(0); + break; + default: + view.showMsg("指令错误,请重新输入"); + } + } catch (NumberFormatException e) { + view.showMsg("请输入数字!"); + } catch (NetworkException e) { + logger.error("网络异常:", e); + view.showMsg("网络异常:" + e.getMessage()); + } catch (ParseException e) { + 
logger.error("解析异常:", e); + view.showMsg("解析异常:" + e.getMessage()); + } catch (CrawlerException e) { + logger.error("爬虫异常:", e); + view.showMsg("爬虫异常:" + e.getMessage()); + }catch (IOException e) { + logger.error("IO异常:", e); + view.showMsg("IO异常:" + e.getMessage()); + } + } + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/command/AnalyzeCommand.java b/project(期末项目报告)/CrawlerMain2/src/command/AnalyzeCommand.java new file mode 100644 index 0000000..35bc41b --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/AnalyzeCommand.java @@ -0,0 +1,22 @@ +package command; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import util.DataUtil; + +public class AnalyzeCommand implements CrawlCommand { + private final MovieCrawlCommand movieCmd; + private final HeroCrawlCommand heroCmd; + private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); + + public AnalyzeCommand(MovieCrawlCommand movieCmd, HeroCrawlCommand heroCmd) { + this.movieCmd = movieCmd; + this.heroCmd = heroCmd; + } + + @Override + public void execute() { + DataUtil.analyzeOnly(movieCmd.getResult(), heroCmd.getResult()); + logger.info("电影&英雄数据分析命令执行完成(仅统计)"); + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/command/CommandInvoker.java b/project(期末项目报告)/CrawlerMain2/src/command/CommandInvoker.java new file mode 100644 index 0000000..8bda699 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/CommandInvoker.java @@ -0,0 +1,13 @@ +package command; + +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; + +import java.io.IOException; + +public class CommandInvoker { + public void runCommand(CrawlCommand cmd) throws ParseException, NetworkException, CrawlerException, IOException { + cmd.execute(); + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/command/CrawlCommand.java b/project(期末项目报告)/CrawlerMain2/src/command/CrawlCommand.java new file mode 100644 index 0000000..bfcd5a2 
--- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/CrawlCommand.java @@ -0,0 +1,12 @@ +package command; + +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; + +import java.io.IOException; + +// ===================== 8. Command模式 ===================== +public interface CrawlCommand { + void execute() throws ParseException, NetworkException, CrawlerException, IOException; +} diff --git a/project(期末项目报告)/CrawlerMain2/src/command/HeroCrawlCommand.java b/project(期末项目报告)/CrawlerMain2/src/command/HeroCrawlCommand.java new file mode 100644 index 0000000..4125917 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/HeroCrawlCommand.java @@ -0,0 +1,54 @@ +package command; + +import controller.CrawlerContext; +import crawler.HeroCrawler; +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import model.Hero; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import util.DataUtil; + +import java.io.IOException; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class HeroCrawlCommand implements CrawlCommand { + private final CrawlerContext context; + private List heroList; + private List oldHeroList; + + private static final Logger logger = LoggerFactory.getLogger(HeroCrawlCommand.class); + + public HeroCrawlCommand(CrawlerContext context) { + this.context = context; + } + + @Override + public void execute() throws ParseException, NetworkException, CrawlerException, IOException { + oldHeroList = DataUtil.importJson("hero.json", Hero.class); + logger.info("导入历史英雄数据:{}条", oldHeroList.size()); + + // 第四点:增量抓取,过滤重复英雄 + Set existNames = DataUtil.getExistHeroNames("hero.json"); + + context.setCrawlerStrategy(new HeroCrawler()); + heroList = (List) context.executeCrawl(); + + heroList = heroList.stream() + .filter(hero -> !existNames.contains(hero.getName())) + .collect(Collectors.toList()); + + 
heroList.addAll(oldHeroList); + + DataUtil.addAll("英雄数据.txt", heroList); + DataUtil.exportJson("hero.json", heroList); + logger.info("英雄爬取完成,本次新增:{}条", heroList.size() - oldHeroList.size()); + } + + public List getResult() { + return heroList; + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/command/MovieCrawlCommand.java b/project(期末项目报告)/CrawlerMain2/src/command/MovieCrawlCommand.java new file mode 100644 index 0000000..ceef898 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/MovieCrawlCommand.java @@ -0,0 +1,57 @@ +package command; + +import controller.CrawlerContext; +import crawler.MovieCrawler; +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import model.Movie; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import util.DataUtil; + +import java.io.IOException; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class MovieCrawlCommand implements CrawlCommand { + private final CrawlerContext context; + private List movieList; + private List oldMovieList; + + private static final Logger logger = LoggerFactory.getLogger(MovieCrawlCommand.class); + + public MovieCrawlCommand(CrawlerContext context) { + this.context = context; + } + + @Override + public void execute() throws ParseException, NetworkException, CrawlerException, IOException { + // 第三点:导入历史数据 + oldMovieList = DataUtil.importJson("movie.json", Movie.class); + logger.info("导入历史电影数据:{}条", oldMovieList.size()); + + // 第四点:获取已存在的电影,增量抓取,避免重复 + Set existTitles = DataUtil.getExistMovieTitles("movie.json"); + + context.setCrawlerStrategy(new MovieCrawler()); + movieList = (List) context.executeCrawl(); + + // 过滤掉已经存在的电影,只保留新数据 + movieList = movieList.stream() + .filter(movie -> !existTitles.contains(movie.getTitle())) + .collect(Collectors.toList()); + + // 合并:新数据 + 历史数据 + movieList.addAll(oldMovieList); + + DataUtil.addAll("电影数据.txt", movieList); + 
DataUtil.exportJson("movie.json", movieList); + logger.info("电影爬取完成,本次新增:{}条", movieList.size() - oldMovieList.size()); + } + + public List getResult() { + return movieList; + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/command/WeatherAnalyzeCommand.java b/project(期末项目报告)/CrawlerMain2/src/command/WeatherAnalyzeCommand.java new file mode 100644 index 0000000..1776add --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/WeatherAnalyzeCommand.java @@ -0,0 +1,20 @@ +package command; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import util.DataUtil; + +public class WeatherAnalyzeCommand implements CrawlCommand { + private final WeatherCrawlCommand weatherCmd; + private static final Logger logger = LoggerFactory.getLogger(WeatherAnalyzeCommand.class); + + public WeatherAnalyzeCommand(WeatherCrawlCommand weatherCmd) { + this.weatherCmd = weatherCmd; + } + + @Override + public void execute() { + DataUtil.analyzeWeatherOnly(weatherCmd.getResult()); + logger.info("天气数据分析命令执行完成(仅统计)"); + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/command/WeatherCrawlCommand.java b/project(期末项目报告)/CrawlerMain2/src/command/WeatherCrawlCommand.java new file mode 100644 index 0000000..08bdd3e --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/command/WeatherCrawlCommand.java @@ -0,0 +1,54 @@ +package command; + +import controller.CrawlerContext; +import crawler.WeatherCrawler; +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import model.Weather; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import util.DataUtil; + +import java.io.IOException; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class WeatherCrawlCommand implements CrawlCommand { + private final CrawlerContext context; + private List weatherList; + private List oldWeatherList; + + private static final Logger logger = 
LoggerFactory.getLogger(WeatherCrawlCommand.class); + + public WeatherCrawlCommand(CrawlerContext context) { + this.context = context; + } + + @Override + public void execute() throws ParseException, NetworkException, CrawlerException, IOException { + oldWeatherList = DataUtil.importJson("weather.json", Weather.class); + logger.info("导入历史天气数据:{}条", oldWeatherList.size()); + + // 第四点:增量抓取,过滤重复城市 + Set existCities = DataUtil.getExistWeatherCities("weather.json"); + + context.setCrawlerStrategy(new WeatherCrawler()); + weatherList = (List) context.executeCrawl(); + + weatherList = weatherList.stream() + .filter(weather -> !existCities.contains(weather.getCity())) + .collect(Collectors.toList()); + + weatherList.addAll(oldWeatherList); + + DataUtil.addAll("天气数据.txt", weatherList); + DataUtil.exportJson("weather.json", weatherList); + logger.info("天气爬取完成,本次新增:{}条", weatherList.size() - oldWeatherList.size()); + } + + public List getResult() { + return weatherList; + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/controller/CrawlerContext.java b/project(期末项目报告)/CrawlerMain2/src/controller/CrawlerContext.java new file mode 100644 index 0000000..fbc7975 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/controller/CrawlerContext.java @@ -0,0 +1,28 @@ +package controller; + +import crawler.Crawler; +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +// ===================== 6. 
策略上下文 ===================== +public class CrawlerContext { + private Crawler crawlerStrategy; + private static final Logger logger = LoggerFactory.getLogger(CrawlerContext.class); + + public void setCrawlerStrategy(Crawler crawlerStrategy) { + this.crawlerStrategy = crawlerStrategy; + } + + public List executeCrawl() throws ParseException, NetworkException, CrawlerException { + if (crawlerStrategy == null) { + logger.error("未设置爬取策略"); + throw new CrawlerException("爬取策略未配置"); + } + return crawlerStrategy.startCrawl(); + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/crawler/BaseCrawler.java b/project(期末项目报告)/CrawlerMain2/src/crawler/BaseCrawler.java new file mode 100644 index 0000000..461be03 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/crawler/BaseCrawler.java @@ -0,0 +1,64 @@ +package crawler; + +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import org.jsoup.Connection; +import org.jsoup.HttpStatusException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +public abstract class BaseCrawler implements Crawler { + protected final String baseUrl; + private static final Logger logger = LoggerFactory.getLogger(BaseCrawler.class); + + public BaseCrawler(String baseUrl) { + this.baseUrl = baseUrl; + } + + // 定义和子类匹配的抽象方法签名 + public abstract List startCrawl() throws ParseException, NetworkException, CrawlerException; + + // 统一请求页面方法(加固防拦截) + public Document getPage(String url) throws NetworkException { + String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"; + String cookie = "ptui_loginuin=; pgv_pvid=123456; RK=randomtest; _qpsvr_localtest=; uin=;"; + + int retry = 3; + while (retry > 0) { + try { + Connection conn = Jsoup.connect(url) + .userAgent(userAgent) + .header("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + .header("Accept-Language", "zh-CN,zh;q=0.9") + .header("Referer", url.contains("douban") ? "https://movie.douban.com/" : "https://pvp.qq.com/") + .header("Connection", "keep-alive") + .cookie("Cookie", cookie) + .timeout(15000) + .followRedirects(true); + + Document doc = conn.get(); + logger.info("第{}次请求页面:{}", 4 - retry, url); + return doc; + } catch (HttpStatusException e) { + retry--; + logger.error("请求页面失败,剩余重试次数:{}", retry, e); + if (retry <= 0) { + throw new NetworkException("页面请求彻底失败:" + url, e); + } + try { + Thread.sleep(2000); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } + } catch (Exception e) { + throw new NetworkException("页面请求异常:" + url, e); + } + } + throw new NetworkException("请求超时:" + url); + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/crawler/Crawler.java b/project(期末项目报告)/CrawlerMain2/src/crawler/Crawler.java new file mode 100644 index 0000000..de0c2a9 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/crawler/Crawler.java @@ -0,0 +1,12 @@ +package crawler; + +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; + +import java.util.List; + +// ===================== 2. 
策略模式:抽象策略接口 ===================== +public interface Crawler { + List startCrawl() throws ParseException, NetworkException, CrawlerException; +} diff --git a/project(期末项目报告)/CrawlerMain2/src/crawler/HeroCrawler.java b/project(期末项目报告)/CrawlerMain2/src/crawler/HeroCrawler.java new file mode 100644 index 0000000..e304132 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/crawler/HeroCrawler.java @@ -0,0 +1,43 @@ +package crawler; + +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import model.Hero; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +public class HeroCrawler extends BaseCrawler { + private static final Logger logger = LoggerFactory.getLogger(HeroCrawler.class); + + public HeroCrawler() { + super("https://pvp.qq.com/web201605/herolist.shtml"); + } + + @Override + public List startCrawl() throws ParseException, NetworkException, CrawlerException { + List list = new ArrayList<>(); + logger.info("开始爬取王者荣耀英雄数据"); + try { + Document doc = getPage(baseUrl); + Elements heros = doc.select("ul.herolist li a"); + if (heros.isEmpty()) throw new ParseException("页面解析失败:未找到英雄列表项"); + for (Element h : heros) { + String name = h.text().trim(); + if (!name.isEmpty()) list.add(new Hero(name)); + } + logger.info("英雄爬取完成,共{}条数据", list.size()); + } catch (NetworkException e) { + throw e; + } catch (Exception e) { + throw new ParseException("英雄数据解析异常", e); + } + return list; + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/crawler/MovieCrawler.java b/project(期末项目报告)/CrawlerMain2/src/crawler/MovieCrawler.java new file mode 100644 index 0000000..b58611a --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/crawler/MovieCrawler.java @@ -0,0 +1,56 @@ +package crawler; + +import exception.CrawlerException; +import exception.NetworkException; +import 
exception.ParseException; +import model.Movie; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +// ===================== 5. 具体策略爬虫类 ===================== +public class MovieCrawler extends BaseCrawler { + private static final Logger logger = LoggerFactory.getLogger(MovieCrawler.class); + + public MovieCrawler() { + super("https://movie.douban.com/top250"); + } + + @Override + public List startCrawl() throws ParseException, NetworkException, CrawlerException { + List list = new ArrayList<>(); + logger.info("开始爬取豆瓣电影Top250"); + try { + for (int i = 0; i < 250; i += 25) { + Document doc = getPage(baseUrl + "?start=" + i); + Elements items = doc.select(".item"); + if (items.isEmpty()) throw new ParseException("页面解析失败:未找到电影列表项"); + for (Element e : items) { + Element titleEle = e.select(".title").first(); + Element ratingEle = e.select(".rating_num").first(); + if (titleEle == null || ratingEle == null) { + logger.warn("单条电影数据解析失败,跳过"); + continue; + } + String title = titleEle.text().split("/")[0].trim(); + String rating = ratingEle.text(); + list.add(new Movie(title, rating)); + } + Thread.sleep(1000); + } + logger.info("豆瓣电影爬取完成,共{}条数据", list.size()); + } catch (NetworkException e) { + throw e; + } catch (InterruptedException e) { + throw new CrawlerException("爬取被中断", e); + } catch (Exception e) { + throw new ParseException("电影数据解析异常", e); + } + return list; + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/crawler/WeatherCrawler.java b/project(期末项目报告)/CrawlerMain2/src/crawler/WeatherCrawler.java new file mode 100644 index 0000000..5a59b8e --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/crawler/WeatherCrawler.java @@ -0,0 +1,74 @@ +package crawler; + +import exception.CrawlerException; +import exception.NetworkException; +import exception.ParseException; +import model.Weather; +import 
org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +public class WeatherCrawler extends BaseCrawler { + private static final Logger logger = LoggerFactory.getLogger(WeatherCrawler.class); + private static final String[][] cities = { + {"北京", "北京", "101010100"}, {"上海", "上海", "101020100"}, {"天津", "天津", "101030100"}, {"重庆", "重庆", "101040100"}, + {"河北", "石家庄", "101090101"}, {"山西", "太原", "101100101"}, {"辽宁", "沈阳", "101070101"}, {"吉林", "长春", "101060101"}, + {"黑龙江", "哈尔滨", "101050101"}, {"江苏", "南京", "101190101"}, {"浙江", "杭州", "101210101"}, {"安徽", "合肥", "101220101"}, + {"福建", "福州", "101230101"}, {"江西", "南昌", "101240101"}, {"山东", "济南", "101120101"}, {"河南", "郑州", "101180101"}, + {"湖北", "武汉", "101200101"}, {"湖南", "长沙", "101250101"}, {"广东", "广州", "101280101"}, {"海南", "海口", "101310101"}, + {"四川", "成都", "101270101"}, {"贵州", "贵阳", "101260101"}, {"云南", "昆明", "101290101"}, {"陕西", "西安", "101110101"}, + {"甘肃", "兰州", "101160101"}, {"青海", "西宁", "101150101"}, {"内蒙古", "呼和浩特", "101080101"}, {"广西", "南宁", "101300101"}, + {"西藏", "拉萨", "101140101"}, {"宁夏", "银川", "101170101"}, {"新疆", "乌鲁木齐", "101130101"}, + {"香港", "香港", "101320101"}, {"澳门", "澳门", "101330101"}, {"台湾", "台北", "101340101"} + }; + + public WeatherCrawler() { + super("https://www.weather.com.cn/weather/"); + } + + @Override + public List startCrawl() throws ParseException, NetworkException, CrawlerException { + List list = new ArrayList<>(); + logger.info("开始爬取全国城市实时温度数据"); + try { + for (String[] city : cities) { + String province = city[0]; + String cityName = city[1]; + String code = city[2]; + Document doc = getPage(baseUrl + code + ".shtml"); + + // 取7天预报【第1个li】=今日实时温度,全页面通用、绝不空指针 + Element today = doc.select("ul.t li").first(); + if (today == null) throw new ParseException("实时天气解析失败:" + cityName); + + String tempStr = today.select(".tem").text(); + String weaStr = today.select(".wea").text(); + + // 
拆分:最高温/最低温,取**最高温作为实时温度** + String realTemp; + if(tempStr.contains("/")){ + realTemp = tempStr.split("/")[0]; + }else if(tempStr.contains("~")){ + realTemp = tempStr.split("~")[0]; + }else{ + realTemp = tempStr; + } + + list.add(new Weather(province, cityName, weaStr, realTemp)); + Thread.sleep(500); + } + logger.info("实时天气爬取完成,共{}条数据", list.size()); + } catch (NetworkException e) { + throw e; + } catch (InterruptedException e) { + throw new CrawlerException("爬取线程被中断", e); + } catch (Exception e) { + throw new ParseException("天气数据解析异常", e); + } + return list; + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/exception/CrawlerException.java b/project(期末项目报告)/CrawlerMain2/src/exception/CrawlerException.java new file mode 100644 index 0000000..628602b --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/exception/CrawlerException.java @@ -0,0 +1,10 @@ +package exception; + +public class CrawlerException extends Exception { + public CrawlerException(String message) { + super(message); + } + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/exception/NetworkException.java b/project(期末项目报告)/CrawlerMain2/src/exception/NetworkException.java new file mode 100644 index 0000000..aa62667 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/exception/NetworkException.java @@ -0,0 +1,10 @@ +package exception; + +public class NetworkException extends CrawlerException { + public NetworkException(String message) { + super(message); + } + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project(期末项目报告)/CrawlerMain2/src/exception/ParseException.java b/project(期末项目报告)/CrawlerMain2/src/exception/ParseException.java new file mode 100644 index 0000000..6092fa4 --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/exception/ParseException.java @@ -0,0 +1,10 @@ +package exception; + +public class ParseException extends 
/**
 * Immutable value holding the display name of one hero.
 */
public class Hero {
    private final String name;

    /**
     * @param name hero name, stored as given (no validation is performed)
     */
    public Hero(String name) {
        this.name = name;
    }

    /** Renders the hero as the fixed label followed by the name. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("英雄:");
        text.append(name);
        return text.toString();
    }

    /** @return the name passed to the constructor */
    public String getName() {
        return this.name;
    }
}
/**
 * Immutable weather record for one city: province, city, condition text and a
 * temperature text such as {@code "25℃"}.
 */
public class Weather {
    /**
     * First run of ASCII digits in a temperature text, with an optional leading minus.
     * Fully qualified so that this class needs no additional import lines.
     */
    private static final java.util.regex.Pattern FIRST_SIGNED_INT =
            java.util.regex.Pattern.compile("-?\\d+");

    private final String province;
    private final String city;
    private final String condition;
    private final String temperature; // temperature text, shown as "real-time temperature"

    public Weather(String province, String city, String condition, String temperature) {
        this.province = province;
        this.city = city;
        this.condition = condition;
        this.temperature = temperature;
    }

    public String getProvince() { return province; }
    public String getCity() { return city; }
    public String getCondition() { return condition; }
    public String getTemperature() { return temperature; }

    /**
     * Extracts the numeric temperature for sorting and statistics.
     *
     * <p>The first signed integer found in the text is used, so {@code "-5℃"} yields
     * {@code -5} and {@code "7/1℃"} yields {@code 7}. (Deleting every non-digit, as was
     * done before, dropped the minus sign and glued separate numbers together.)
     *
     * @return the parsed value, or {@link Integer#MIN_VALUE} when the text is
     *         {@code null}, contains no digits, or does not fit in an {@code int}
     */
    public int getTempNum() {
        if (temperature == null) {
            return Integer.MIN_VALUE;
        }
        java.util.regex.Matcher m = FIRST_SIGNED_INT.matcher(temperature);
        if (!m.find()) {
            return Integer.MIN_VALUE;
        }
        try {
            return Integer.parseInt(m.group());
        } catch (NumberFormatException e) {
            // Digit run too long for an int: treat as "no usable value".
            return Integer.MIN_VALUE;
        }
    }

    @Override
    public String toString() {
        return "省份:" + province + " | 城市:" + city + " | 天气:" + condition + " | 实时温度:" + temperature;
    }
}
b/project(期末项目报告)/CrawlerMain2/src/util/DataUtil.java @@ -0,0 +1,195 @@ +package util; + +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONWriter; +import exception.ParseException; +import model.Hero; +import model.Movie; +import model.Weather; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.HashSet; +import java.util.Set; + +// ===================== 7. MVC‑Repository:数据仓库 ===================== +public final class DataUtil { + private static final String PATH = "D:\\Java爬虫\\"; + private static final Logger logger = LoggerFactory.getLogger(DataUtil.class); + + private DataUtil() { + } + + public static void initFolder() { + File dir = new File(PATH); + if (!dir.exists()) { + boolean created = dir.mkdirs(); + if (created) logger.info("创建目录:{}", PATH); + else logger.error("目录创建失败:{}", PATH); + } + } + + // ✅ 第一点:try‑with‑resources 安全资源管理 + public static void saveText(String fileName, String content) throws IOException { + if (fileName == null || fileName.isBlank()) throw new IllegalArgumentException("文件名不能为空"); + if (content == null || content.isBlank()) { + logger.warn("保存文件内容为空,跳过:{}", fileName); + return; + } + try (FileWriter fw = new FileWriter(PATH + fileName)) { + fw.write(content); + } + logger.info("文件保存成功:{}", fileName); + } + + public static void addAll(String fileName, List dataList) throws IOException { + if (dataList == null) throw new NullPointerException("待保存数据列表不能为null"); + if (dataList.isEmpty()) { + logger.warn("批量数据为空,跳过保存:{}", fileName); + return; + } + StringBuilder sb = new StringBuilder(); + dataList.forEach(item -> { + if (item != null) sb.append(item).append("\r\n"); + }); + saveText(fileName, sb.toString()); + } + + // ✅ 第二点:JSON 序列化持久化导出 + public static void 
exportJson(String fileName, List dataList) throws IOException { + if (dataList == null || dataList.isEmpty()) { + logger.warn("JSON导出:数据为空,跳过"); + return; + } + try (FileWriter fw = new FileWriter(PATH + fileName)) { + String jsonStr = JSON.toJSONString(dataList, JSONWriter.Feature.PrettyFormat); + fw.write(jsonStr); + } + logger.info("✅ JSON文件导出成功:{}", fileName); + } + + // ✅ 第三点:数据导入恢复会话(已加入解析异常捕获) + public static List importJson(String fileName, Class clazz) throws IOException, ParseException { + File file = new File(PATH + fileName); + if (!file.exists()) { + logger.warn("导入文件不存在:{}", fileName); + return Collections.emptyList(); + } + try (FileReader fr = new FileReader(file)) { + char[] buf = new char[(int) file.length()]; + fr.read(buf); + String jsonStr = new String(buf); + try { + return JSON.parseArray(jsonStr, clazz); + } catch (com.alibaba.fastjson2.JSONException e) { + logger.error("JSON格式解析错误:{}", e.getMessage()); + throw new ParseException("数据格式错误,解析失败:" + e.getMessage()); + } + } + } + + // ✅ 第四点:增量抓取 - 获取已爬取电影标题集合(去重用) + public static Set getExistMovieTitles(String fileName) { + try { + List oldList = importJson(fileName, Movie.class); + return oldList.stream().map(Movie::getTitle).collect(Collectors.toSet()); + } catch (IOException | ParseException e) { + logger.warn("读取历史电影数据失败,将全量抓取", e); + return new HashSet<>(); + } + } + + // ✅ 第四点:增量抓取 - 获取已爬取英雄名称集合(去重用) + public static Set getExistHeroNames(String fileName) { + try { + List oldList = importJson(fileName, Hero.class); + return oldList.stream().map(Hero::getName).collect(Collectors.toSet()); + } catch (IOException | ParseException e) { + logger.warn("读取历史英雄数据失败,将全量抓取", e); + return new HashSet<>(); + } + } + + // ✅ 第四点:增量抓取 - 获取已爬取城市名集合(天气去重用) + public static Set getExistWeatherCities(String fileName) { + try { + List oldList = importJson(fileName, Weather.class); + return oldList.stream().map(Weather::getCity).collect(Collectors.toSet()); + } catch (IOException | ParseException e) { + 
logger.warn("读取历史天气数据失败,将全量抓取", e); + return new HashSet<>(); + } + } + + public static void analyzeOnly(List movieList, List heroList) { + if (movieList == null || heroList == null) return; + logger.info("===== 电影&英雄数据分析(仅统计,不存储) ====="); + double sum = 0; + int validCount = 0; + for (Movie movie : movieList) { + try { + sum += movie.getRatingDouble(); + validCount++; + } catch (IllegalArgumentException e) { + logger.warn("电影评分解析失败,跳过:{}", movie.getTitle(), e); + } + } + if (validCount == 0) { + logger.error("无有效电影评分数据"); + return; + } + double avg = sum / validCount; + System.out.println("电影平均评分:" + String.format("%.2f", avg)); + long highScoreCount = movieList.stream() + .filter(m -> { + try { + return m.getRatingDouble() >= 8.5; + } catch (IllegalArgumentException e) { + return false; + } + }) + .count(); + System.out.println("8.5分以上电影数量:" + highScoreCount); + System.out.println("英雄总数量:" + heroList.size()); + logger.info("电影&英雄数据分析结束"); + } + + public static void analyzeWeatherOnly(List weatherList) { + if (weatherList == null || weatherList.isEmpty()) { + logger.warn("天气数据为空,无法统计"); + return; + } + logger.info("===== 全国天气数据分析(仅统计,不存储) ====="); + + Map weatherTypeCount = weatherList.stream() + .collect(Collectors.groupingBy(Weather::getCondition, Collectors.counting())); + System.out.println("\n各天气类型数量:"); + weatherTypeCount.forEach((type, count) -> System.out.println(" " + type + ":" + count + "个")); + + List temps = weatherList.stream() + .map(Weather::getTempNum) + .filter(t -> t != Integer.MIN_VALUE) + .collect(Collectors.toList()); + + if (!temps.isEmpty()) { + int maxTemp = Collections.max(temps); + int minTemp = Collections.min(temps); + double avgTemp = temps.stream().mapToInt(Integer::intValue).average().orElse(0); + System.out.println("\n温度统计(最高温):"); + System.out.println(" 最高温度:" + maxTemp + "℃"); + System.out.println(" 最低温度:" + minTemp + "℃"); + System.out.println(" 平均温度:" + String.format("%.1f", avgTemp) + "℃"); + } else { + 
System.out.println("无有效温度数据"); + } + logger.info("天气数据分析结束"); + } +} \ No newline at end of file diff --git a/project(期末项目报告)/CrawlerMain2/src/view/CrawlerView.java b/project(期末项目报告)/CrawlerMain2/src/view/CrawlerView.java new file mode 100644 index 0000000..01584cd --- /dev/null +++ b/project(期末项目报告)/CrawlerMain2/src/view/CrawlerView.java @@ -0,0 +1,33 @@ +package view; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Scanner; + +// ===================== 9. MVC‑View:视图层 ===================== +public class CrawlerView { + private static final Logger logger = LoggerFactory.getLogger(CrawlerView.class); + private final Scanner scanner = new Scanner(System.in); + + public void showMenu() { + System.out.println("\n===== 爬虫CLI交互菜单 ====="); + System.out.println("1. 爬取豆瓣电影"); + System.out.println("2. 爬取王者荣耀英雄"); + System.out.println("3. 爬取全国天气"); + System.out.println("4. 电影&英雄数据分析(仅统计,不存储)"); + System.out.println("5. 天气数据分析(天气类型、最高/最低/平均温)"); + System.out.println("6. 导入历史数据"); + System.out.println("0. 退出程序"); + System.out.print("请输入操作指令:"); + } + + public int getInput() { + return Integer.parseInt(scanner.nextLine()); + } + + public void showMsg(String msg) { + System.out.println(msg); + logger.info(msg); + } +}