From 627f053698912b8671a10e08d29a5568068e30f2 Mon Sep 17 00:00:00 2001 From: WangYangyang <3093159564@qq.com> Date: Tue, 12 May 2026 17:56:15 +0800 Subject: [PATCH] =?UTF-8?q?=E7=8E=8B=E7=83=8A=E7=83=8A202302050115W10?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- w10/AI架构审计.md | 78 +++++++++++++++++++ w10/datacollect/Main.java | 21 +++++ w10/datacollect/command/AnalyzeCommand.java | 65 ++++++++++++++++ w10/datacollect/command/Command.java | 8 ++ w10/datacollect/command/CrawlCommand.java | 50 ++++++++++++ w10/datacollect/command/ExitCommand.java | 23 ++++++ w10/datacollect/command/HelpCommand.java | 22 ++++++ w10/datacollect/command/ListCommand.java | 22 ++++++ .../controller/CrawlerController.java | 49 ++++++++++++ w10/datacollect/model/Article.java | 45 +++++++++++ .../repository/ArticleRepository.java | 40 ++++++++++ w10/datacollect/strategy/BlogStrategy.java | 30 +++++++ w10/datacollect/strategy/CrawlStrategy.java | 14 ++++ w10/datacollect/strategy/DefaultStrategy.java | 27 +++++++ w10/datacollect/strategy/HnuNewsStrategy.java | 55 +++++++++++++ w10/datacollect/strategy/NewsStrategy.java | 32 ++++++++ w10/datacollect/strategy/StrategyFactory.java | 29 +++++++ w10/datacollect/view/ConsoleView.java | 42 ++++++++++ w10/类图.svg | 1 + 19 files changed, 653 insertions(+) create mode 100644 w10/AI架构审计.md create mode 100644 w10/datacollect/Main.java create mode 100644 w10/datacollect/command/AnalyzeCommand.java create mode 100644 w10/datacollect/command/Command.java create mode 100644 w10/datacollect/command/CrawlCommand.java create mode 100644 w10/datacollect/command/ExitCommand.java create mode 100644 w10/datacollect/command/HelpCommand.java create mode 100644 w10/datacollect/command/ListCommand.java create mode 100644 w10/datacollect/controller/CrawlerController.java create mode 100644 w10/datacollect/model/Article.java create mode 100644 w10/datacollect/repository/ArticleRepository.java create mode 100644 w10/datacollect/strategy/BlogStrategy.java create mode 100644 w10/datacollect/strategy/CrawlStrategy.java create mode 100644 w10/datacollect/strategy/DefaultStrategy.java create mode 100644 w10/datacollect/strategy/HnuNewsStrategy.java create mode 100644 w10/datacollect/strategy/NewsStrategy.java create mode 100644 w10/datacollect/strategy/StrategyFactory.java create mode 100644 w10/datacollect/view/ConsoleView.java create mode 100644 w10/类图.svg diff --git a/w10/AI架构审计.md b/w10/AI架构审计.md new file mode 100644 index 0000000..63f60c0 --- /dev/null +++ b/w10/AI架构审计.md @@ -0,0 +1,78 @@ +作为 Java 架构审计师,基于您提供的代码库(包含核心任务及选做任务的实现),以下是针对 **策略模式解耦**、**Repository 封装** 及 **工厂性能** 的深度审计报告。 + +--- + +### 📋 架构审计报告 + +#### 1. 策略模式解耦性审计 (Strategy Pattern Decoupling) +**✅ 审计结论:基本正确,符合开闭原则。** + +* **现状分析**: + * `CrawlCommand` 和 `AnalyzeCommand` 均通过 `StrategyFactory` 获取策略,并未在命令内部硬编码 `if (url.contains("hnu"))` 等具体网站逻辑。 + * 解析逻辑完全下沉到 `HnuNewsStrategy` 等具体策略类中。 + * `AnalyzeCommand` 复用了 `CrawlStrategy` 的 `parse` 方法,实现了“一次实现,多处复用”的目标。 +* **潜在风险**: + * 如果 `supports()` 方法内部包含复杂的 URL 解析逻辑,可能会污染策略接口。 + * `AnalyzeCommand` 目前虽然不存数据,但如果未来需要“分析后自动保存”,它可能会直接调用 `repository`,导致职责不清。 +* **改进建议**: + 1. **保持现状**:目前的解耦设计是合理的,无需大改。 + 2. **接口纯净化**:确保 `CrawlStrategy` 接口中不包含任何与“存储”相关的逻辑,只关注“解析”。 + 3. **异常隔离**:在 `CrawlCommand` 中捕获 `Jsoup` 异常时,建议区分网络异常和解析异常,以便策略层能针对性处理(例如重试或跳过)。 + +#### 2. Repository 数据访问封装审计 (Repository Encapsulation) +**⚠️ 审计结论:封装性良好,但存在数据模型可变性风险。** + +* **现状分析**: + * `ArticleRepository` 内部持有 `List
`,外部无法直接获取引用(`getAll()` 返回 `Collections.unmodifiableList`)。 + * `addAll` 方法增加了 `null` 检查,符合防御性编程。 + * 命令层(`CrawlCommand`)通过 `repository.add()` 操作数据,没有绕过 Repository 直接操作 List。 +* **潜在风险**: + 1. **模型可变性**:`Article` 类提供了 `setTitle`, `setContent` 等 Setter 方法。这意味着外部获取到 `Article` 对象后,可以修改其状态,破坏了 Repository 的数据一致性封装。 + 2. **线程安全**:`ArticleRepository` 内部使用 `ArrayList`,在多线程环境下(如未来扩展并发爬虫)存在 `ConcurrentModificationException` 风险。 + 3. **资源泄露**:`ConsoleView` 中的 `Scanner` 未关闭,虽然程序退出时 OS 会回收,但规范上应实现 `AutoCloseable`。 +* **改进建议**: + 1. **不可变模型**:将 `Article` 类改为 **不可变类 (Immutable)**。移除所有 Setter 方法,构造函数中初始化所有字段。 + 2. **线程安全**:将 `ArticleRepository` 内部 List 替换为 `CopyOnWriteArrayList` 或 `Collections.synchronizedList`,或者在 Repository 层面加锁。 + 3. **资源管理**:`ConsoleView` 实现 `AutoCloseable` 接口,在 `Main` 中通过 `try-with-resources` 关闭 Scanner。 + +#### 3. 策略工厂匹配逻辑性能审计 (Factory Performance) +**❌ 审计结论:存在显著性能隐患,不适合大规模扩展。** + +* **现状分析**: + * 在 `StrategyFactory.getStrategy()` 中,使用了 `strategies.stream().sorted(...).filter(...)`。 + * **性能隐患**:每次调用 `getStrategy`(即每次爬虫请求)都会执行一次 **排序操作 (O(N log N))**。如果网站数量增加到 100 个,且用户频繁调用 `crawl` 命令,这会带来不必要的 CPU 开销。 + * **匹配逻辑**:目前依赖 `supports()` 的线性遍历。如果 URL 规则复杂,正则匹配本身也有开销。 +* **改进建议**: + 1. **预排序 (Pre-sorting)**: + * **方案**:在 `StrategyFactory` 的 **构造函数** 中完成排序,而不是在 `getStrategy` 方法中。 + * **代码**: + ```java + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + // ... + Collections.sort(strategies, (s1, s2) -> Integer.compare(s2.getPriority(), s1.getPriority())); + } + ``` + 2. **路由优化 (Routing Optimization)**: + * 如果网站数量超过 50 个,线性遍历 (`O(N)`) 效率会下降。 + * **方案**:使用 `Map` 进行前缀匹配,或使用 **Trie 树 (字典树)** 存储 URL 域名前缀。 + * **示例**:将 `blog.example.com` 和 `news.hnu.edu.cn` 作为 Key 存入 Map,匹配时直接 `map.get(url)`,将复杂度降为 `O(1)`。 + 3. **正则缓存**:确保 `Pattern` 对象是 `static final` 的(当前代码已做到),避免重复编译正则表达式。 + +--- + +### 🚀 综合改进建议清单 (Action Items) + +| 优先级 | 模块 | 问题描述 | 改进方案 | +| :--- | :--- | :--- | :--- | +| **P0** | `StrategyFactory` | `getStrategy` 中每次调用都排序 | **移至构造函数排序**,消除 O(N log N) 重复计算。 | +| **P0** | `Article` | 存在 Setter 方法,数据可被篡改 | **移除 Setter**,改为全参构造函数,实现不可变对象。 | +| **P1** | `ArticleRepository` | `ArrayList` 非线程安全 | 若需并发,改为 `CopyOnWriteArrayList` 或加锁。 | +| **P1** | `ConsoleView` | `Scanner` 未关闭 | 实现 `AutoCloseable` 并在 `Main` 中关闭。 | +| **P2** | `StrategyFactory` | 100+ 网站时线性匹配慢 | 引入 **URL 前缀映射表** 或 **Trie 树** 优化匹配。 | +| **P3** | `CrawlCommand` | 异常捕获过宽 | 区分 `IOException` (网络) 和 `ParserException` (解析)。 | + +### 📝 审计总结 +当前架构在 **设计模式** 和 **分层解耦** 上做得非常出色,特别是策略模式与命令模式的结合,使得系统具备极强的扩展性。 + +**最大的瓶颈在于 `StrategyFactory` 的运行时性能**(每次调用都排序)和 **数据模型的封装性**(`Article` 可变)。建议优先修复这两个问题,即可支撑从“教学 Demo"到“生产级爬虫系统”的跨越。 \ No newline at end of file diff --git a/w10/datacollect/Main.java b/w10/datacollect/Main.java new file mode 100644 index 0000000..d179115 --- /dev/null +++ b/w10/datacollect/Main.java @@ -0,0 +1,21 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; + +public class Main { + + public static void main(String[] args) { + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); + while (true) { + controller.handle(view.readLine()); + } + } +} diff --git a/w10/datacollect/command/AnalyzeCommand.java b/w10/datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..0f06cbc --- /dev/null +++ b/w10/datacollect/command/AnalyzeCommand.java @@ -0,0 +1,65 @@ +package com.example.datacollect.command; +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import java.util.List; +import java.util.stream.Collectors; + +public class AnalyzeCommand implements Command{ + private final ConsoleView view; + private final StrategyFactory strategyFactory; + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + @Override + public String getName() { + return "analyze"; + } + @Override + public void execute(String []args,ArticleRepository repository){ + if(args.length<2){ + view.printError("Usage:analyze "); + } + String url=args[1]; + CrawlStrategy strategy=strategyFactory.getStrategy(url); + if (strategy == null){ + view.printError("No strategy found for: "+url); + return; + } + try{ + view.printInfo("Analyzing: "+url); + Document doc=Jsoup.connect(url).get(); + //调用策略解析,但不存入Repository + List
articles=strategy.parse(url,doc); + //统计信息 + int total=articles.size(); + double avgTitleLen=articles.stream() + .mapToInt(a -> a.getTitle().length()) + .average() + .orElse(0.0); + //Top 5 按标题长度排序 + List
top5 =articles.stream() + .sorted((a,b) -> Integer.compare(b.getTitle().length(),a.getTitle().length())) + .limit(5) + .collect(Collectors.toList()); + //输出结果 + view.printInfo("=== Analysis Result ==="); + view.printInfo("Total Articles: " + total); + view.printInfo("Avg Title Length: " + String.format("%.2f", avgTitleLen)); + view.printInfo("Top 5 Articles (by Title Length):"); + int rank = 1; + for (Article a : top5) { + view.printInfo(rank + ". " + a.getTitle() + " (" + a.getTitle().length() + " chars)"); + rank++; + } + view.printInfo("========================"); + } catch (Exception e){ + view.printError("Failed to analyze: "+e.getMessage()); + } + } +} diff --git a/w10/datacollect/command/Command.java b/w10/datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w10/datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w10/datacollect/command/CrawlCommand.java b/w10/datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..0841d57 --- /dev/null +++ b/w10/datacollect/command/CrawlCommand.java @@ -0,0 +1,50 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +public class CrawlCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: crawl "); + return; + } + String url = args[1]; + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + return; + } + + try { + view.printInfo("Crawling: " + url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + for (var article : articles) { + repository.add(article); + } + view.printSuccess("Crawled " + articles.size() + " articles."); + } catch (Exception e) { + view.printError("Failed to crawl: " + e.getMessage()); + } + } +} diff --git a/w10/datacollect/command/ExitCommand.java b/w10/datacollect/command/ExitCommand.java new file mode 100644 index 0000000..eafcd1d --- /dev/null +++ b/w10/datacollect/command/ExitCommand.java @@ -0,0 +1,23 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ExitCommand implements Command { + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printSuccess("Bye!"); + System.exit(0); + } +} diff --git a/w10/datacollect/command/HelpCommand.java b/w10/datacollect/command/HelpCommand.java new file mode 100644 index 0000000..dd7a175 --- /dev/null +++ b/w10/datacollect/command/HelpCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class HelpCommand implements Command { + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printInfo("Commands: crawl , list, help, exit"); + } +} diff --git a/w10/datacollect/command/ListCommand.java b/w10/datacollect/command/ListCommand.java new file mode 100644 index 0000000..8147be8 --- /dev/null +++ b/w10/datacollect/command/ListCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ListCommand implements Command { + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.display(repository.getAll()); + } +} diff --git a/w10/datacollect/controller/CrawlerController.java b/w10/datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..1cbd130 --- /dev/null +++ b/w10/datacollect/controller/CrawlerController.java @@ -0,0 +1,49 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new AnalyzeCommand(view,strategyFactory));//新增 + register(new ExitCommand(view)); + } + + private void register(Command command) { + commands.put(command.getName(), command); + } + + public void handle(String input) { + String text = input == null ? "" : input.trim(); + if (text.isEmpty()) { + return; + } + + String[] args = text.split("\\s+"); + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + if (command == null) { + view.printError("Unknown command: " + cmdName); + return; + } + command.execute(args, repository); + } +} diff --git a/w10/datacollect/model/Article.java b/w10/datacollect/model/Article.java new file mode 100644 index 0000000..147dbe6 --- /dev/null +++ b/w10/datacollect/model/Article.java @@ -0,0 +1,45 @@ +package com.example.datacollect.model; + +public class Article { + private String title; + private String url; + private String content; + + public Article(String title, String url, String content) { + this.title = title; + this.url = url; + this.content = content; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w10/datacollect/repository/ArticleRepository.java b/w10/datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..cb232ae --- /dev/null +++ b/w10/datacollect/repository/ArticleRepository.java @@ -0,0 +1,40 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ArticleRepository { + private final List
articles = new ArrayList<>(); + + public void add(Article article) { + if (article == null) { + throw new IllegalArgumentException("Article cannot be null"); + } + articles.add(article); + } + // ★ 新增:批量添加方法以及注意防御 null + public void addAll(List
articles) { + if (articles == null) { + throw new IllegalArgumentException("Articles list cannot be null"); + } + for (Article article : articles) { + if (article == null) { + throw new IllegalArgumentException("Article in list cannot be null"); + } + this.articles.add(article); + } + } + public List
getAll() { + return Collections.unmodifiableList(articles); + } + + public int size() { + return articles.size(); + } + + public void clear() { + articles.clear(); + } +} diff --git a/w10/datacollect/strategy/BlogStrategy.java b/w10/datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..d70eff5 --- /dev/null +++ b/w10/datacollect/strategy/BlogStrategy.java @@ -0,0 +1,30 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; +public class BlogStrategy implements CrawlStrategy { + private static final Pattern URL_PATTERN = Pattern.compile(".*blog\\.example\\.com.*"); + @Override + public boolean supports(String url) { + return URL_PATTERN.matcher(url).matches(); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } + @Override + public int getPriority(){ + return 10;//优先级高于默认策略 + } +} diff --git a/w10/datacollect/strategy/CrawlStrategy.java b/w10/datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..d4af942 --- /dev/null +++ b/w10/datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,14 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy { + List
parse(String url, Document doc); + boolean supports(String url); + //增加优先级 + default int getPriority(){ + return 0; + } +} diff --git a/w10/datacollect/strategy/DefaultStrategy.java b/w10/datacollect/strategy/DefaultStrategy.java new file mode 100644 index 0000000..0d4ba7e --- /dev/null +++ b/w10/datacollect/strategy/DefaultStrategy.java @@ -0,0 +1,27 @@ +package com.example.datacollect.strategy; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; +public class DefaultStrategy implements CrawlStrategy{ + @Override + public boolean supports(String url) { + return true; // 兜底策略 + } + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + // 通用逻辑:提取所有 h1 或 h2 作为标题 + Elements titles = doc.select("h1, h2"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } + @Override + public int getPriority() { + return -1; // 优先级最低 + } +} diff --git a/w10/datacollect/strategy/HnuNewsStrategy.java b/w10/datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..09e6168 --- /dev/null +++ b/w10/datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,55 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class HnuNewsStrategy implements CrawlStrategy { + // 使用正则匹配,更灵活 + private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.hnu\\.edu\\.cn.*"); + @Override + public boolean supports(String url) { + return URL_PATTERN.matcher(url).matches(); + } + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements listItems = doc.select("ul.list11 li"); + + for (Element li : listItems) { + Element link = li.selectFirst("a"); + if (link == null) continue; + + String articleUrl = link.attr("href"); + if (!articleUrl.startsWith("http")) { + articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); + } + + String title = ""; + Element titleEl = link.selectFirst("h4.l2.h4s2"); + if (titleEl != null) { + title = titleEl.text().trim(); + } + + String content = ""; + Element contentEl = link.selectFirst("p.l3.ps3"); + if (contentEl != null) { + content = contentEl.text().trim(); + } + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + } + } + + return articles; + } + @Override + public int getPriority(){ + return 15; + } +} diff --git a/w10/datacollect/strategy/NewsStrategy.java b/w10/datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..f9f3404 --- /dev/null +++ b/w10/datacollect/strategy/NewsStrategy.java @@ -0,0 +1,32 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class NewsStrategy implements CrawlStrategy { + // 使用正则匹配 + private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.example\\.com.*"); + @Override + public boolean supports(String url) { + return URL_PATTERN.matcher(url).matches(); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } + @Override + public int getPriority(){ + return 10; + } +} diff --git a/w10/datacollect/strategy/StrategyFactory.java b/w10/datacollect/strategy/StrategyFactory.java new file mode 100644 index 0000000..0a0a643 --- /dev/null +++ b/w10/datacollect/strategy/StrategyFactory.java @@ -0,0 +1,29 @@ +package com.example.datacollect.strategy; + +import java.util.ArrayList; +import java.util.List; + +public class StrategyFactory { + private final List strategies = new ArrayList<>(); + + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + strategies.add(new BlogStrategy()); + strategies.add(new NewsStrategy()); + //注册默认策略 + strategies.add(new DefaultStrategy()); + } + + public CrawlStrategy getStrategy(String url) { + //按优先级降序排序 + return strategies.stream() + .sorted((s1, s2) -> Integer.compare(s2.getPriority(), s1.getPriority())) + .filter(s -> s.supports(url)) + .findFirst() + .orElse(null); // 如果默认策略未匹配到,返回 null 或默认策略本身 + } + + public void register(CrawlStrategy strategy) { + strategies.add(strategy); + } +} diff --git a/w10/datacollect/view/ConsoleView.java b/w10/datacollect/view/ConsoleView.java new file mode 100644 index 0000000..3c1d47a --- /dev/null +++ b/w10/datacollect/view/ConsoleView.java @@ -0,0 +1,42 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + return scanner.nextLine(); + } + + public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +} diff --git a/w10/类图.svg b/w10/类图.svg new file mode 100644 index 0000000..2bebc09 --- /dev/null +++ b/w10/类图.svg @@ -0,0 +1 @@ +

注册/管理

使用

使用

使用 (通过 Command 间接)

使用

使用

使用

使用

使用

仅读取/不存

使用

读取

聚合 (List)

返回 (List)

聚合 (List)

显示

实例化

实例化

实例化

实例化

«interface»

Command

+String getName()

+void execute(String[] args, ArticleRepository repository)

«interface»

CrawlStrategy

+List<Article> parse(String url, Document doc)

+boolean supports(String url)

+int getPriority()

CrawlCommand

-ConsoleView view

-StrategyFactory strategyFactory

+execute(String[] args, ArticleRepository repository)

AnalyzeCommand

-ConsoleView view

-StrategyFactory strategyFactory

+execute(String[] args, ArticleRepository repository)

ListCommand

-ConsoleView view

+execute(String[] args, ArticleRepository repository)

HelpCommand

-ConsoleView view

+execute(String[] args, ArticleRepository repository)

ExitCommand

-ConsoleView view

+execute(String[] args, ArticleRepository repository)

CrawlerController

-Map<String, Command> commands

-ConsoleView view

-ArticleRepository repository

+handle(String input)

ArticleRepository

-List<Article> articles

+void add(Article article)

+void addAll(List<Article> articles)

+List<Article> getAll()

+int size()

+void clear()

Article

-String title

-String url

-String content

+Article(String title, String url, String content)

+String getTitle()

+String getUrl()

+String getContent()

ConsoleView

-Scanner scanner

+String readLine()

+void printSuccess(String msg)

+void printError(String msg)

+void printInfo(String msg)

+void display(List<Article> articles)

StrategyFactory

-List<CrawlStrategy> strategies

+StrategyFactory()

+CrawlStrategy getStrategy(String url)

+void register(CrawlStrategy strategy)

HnuNewsStrategy

+boolean supports(String url)

+List<Article> parse(String url, Document doc)

+int getPriority()

NewsStrategy

+boolean supports(String url)

+List<Article> parse(String url, Document doc)

+int getPriority()

BlogStrategy

+boolean supports(String url)

+List<Article> parse(String url, Document doc)

+int getPriority()

DefaultStrategy

+boolean supports(String url)

+List<Article> parse(String url, Document doc)

+int getPriority()

Main

包含 getPriority() 用于排序

包含 addAll() 批量添加

仅分析统计,不存储

\ No newline at end of file