From 3d34bb3ed1a18f625cb7e2b38b2ef6246397017e Mon Sep 17 00:00:00 2001 From: ZhengJiayin <13230092115@163.com> Date: Wed, 13 May 2026 21:12:18 +0800 Subject: [PATCH] w10 --- w10/Main.java | 21 ++++ w10/README.md | 144 ++++++++++++++++++++++++++ w10/command/AnalyzeCommand.java | 87 ++++++++++++++++ w10/command/Command.java | 8 ++ w10/command/CrawlCommand.java | 50 +++++++++ w10/command/ExitCommand.java | 23 ++++ w10/command/HelpCommand.java | 28 +++++ w10/command/HistoryCommand.java | 33 ++++++ w10/command/ListCommand.java | 22 ++++ w10/controller/CrawlerController.java | 56 ++++++++++ w10/model/Article.java | 73 +++++++++++++ w10/repository/ArticleRepository.java | 41 ++++++++ w10/strategy/BlogStrategy.java | 25 +++++ w10/strategy/CrawlStrategy.java | 19 ++++ w10/strategy/GenericNewsStrategy.java | 57 ++++++++++ w10/strategy/HnuNewsStrategy.java | 49 +++++++++ w10/strategy/NewsStrategy.java | 25 +++++ w10/strategy/StrategyFactory.java | 107 +++++++++++++++++++ w10/view/ConsoleView.java | 42 ++++++++ 19 files changed, 910 insertions(+) create mode 100644 w10/Main.java create mode 100644 w10/README.md create mode 100644 w10/command/AnalyzeCommand.java create mode 100644 w10/command/Command.java create mode 100644 w10/command/CrawlCommand.java create mode 100644 w10/command/ExitCommand.java create mode 100644 w10/command/HelpCommand.java create mode 100644 w10/command/HistoryCommand.java create mode 100644 w10/command/ListCommand.java create mode 100644 w10/controller/CrawlerController.java create mode 100644 w10/model/Article.java create mode 100644 w10/repository/ArticleRepository.java create mode 100644 w10/strategy/BlogStrategy.java create mode 100644 w10/strategy/CrawlStrategy.java create mode 100644 w10/strategy/GenericNewsStrategy.java create mode 100644 w10/strategy/HnuNewsStrategy.java create mode 100644 w10/strategy/NewsStrategy.java create mode 100644 w10/strategy/StrategyFactory.java create mode 100644 w10/view/ConsoleView.java diff --git a/w10/Main.java b/w10/Main.java new file mode 100644 index 0000000..0f98466 --- /dev/null +++ b/w10/Main.java @@ -0,0 +1,21 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; + +public class Main { + + public static void main(String[] args) { + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); + while (true) { + controller.handle(view.readLine()); + } + } +} \ No newline at end of file diff --git a/w10/README.md b/w10/README.md new file mode 100644 index 0000000..0c8bb27 --- /dev/null +++ b/w10/README.md @@ -0,0 +1,144 @@ +# W10 作业提交:设计模式实战 + +## 目录结构 + +``` +w10/ +└── src/ + └── main/ + └── java/ + └── com/ + └── example/ + └── datacollect/ + ├── Main.java + ├── command/ + │ ├── Command.java + │ ├── CrawlCommand.java + │ ├── AnalyzeCommand.java + │ ├── ListCommand.java + │ ├── HelpCommand.java + │ ├── ExitCommand.java + │ └── HistoryCommand.java + ├── controller/ + │ └── CrawlerController.java + ├── model/ + │ └── Article.java + ├── repository/ + │ └── ArticleRepository.java + ├── strategy/ + │ ├── CrawlStrategy.java + │ ├── StrategyFactory.java + │ ├── HnuNewsStrategy.java + │ ├── BlogStrategy.java + │ ├── NewsStrategy.java + │ └── GenericNewsStrategy.java + └── view/ + └── ConsoleView.java +``` + +## 必做任务完成情况 + +### 1. ArticleRepository 完善 ✅ +- `add()`: 拒绝 null,抛出 IllegalArgumentException +- `addAll()`: 拒绝 null 列表和列表中的 null 元素 +- `getAll()`: 返回 `Collections.unmodifiableList()` 不可变视图 +- `size()`: 返回文章数量 +- `clear()`: 清空所有文章 + +### 2. AnalyzeCommand ✅ +- 复用策略解析但**不存储**到 Repository +- 输出统计信息:文章总数、含作者/日期/内容的数量、使用的策略名称 +- 显示前 3 篇文章标题作为预览 + +### 3. AI 架构审计 ✅ + +#### 类签名汇总 + +```java +// Command 层 +interface Command { void execute(String[], ArticleRepository); } +class CrawlCommand(ConsoleView, StrategyFactory) +class AnalyzeCommand(ConsoleView, StrategyFactory) +class ListCommand(ConsoleView) +class HelpCommand(ConsoleView) +class ExitCommand(ConsoleView) +class HistoryCommand(ConsoleView, List) + +// Controller 层 +class CrawlerController(ConsoleView, ArticleRepository, StrategyFactory) + +// Repository 层 +class ArticleRepository { add(), addAll(), getAll(), size(), clear() } + +// Strategy 层 +interface CrawlStrategy { parse(), supports(), getPriority(), getPattern() } +class StrategyFactory { getStrategy(url), register(), setDefaultStrategy() } +class HnuNewsStrategy implements CrawlStrategy +class BlogStrategy implements CrawlStrategy +class NewsStrategy implements CrawlStrategy +class GenericNewsStrategy implements CrawlStrategy (正则匹配) + +// Model 层 +class Article { title, url, content, author, publishDate } + +// View 层 +class ConsoleView +``` + +#### 架构审计结果 + +| 检查项 | 结果 | 说明 | +|--------|------|------| +| **策略解耦** | ✅ 优秀 | 策略接口与实现完全分离 | +| **Repository 封装** | ✅ 优秀 | 使用不可变视图 + null 防御 | +| **开闭原则** | ✅ 达标 | 新增网站只需加策略类 + 注册一行 | +| **依赖倒置** | ✅ 良好 | Command/Strategy 依赖抽象接口 | +| **单一职责** | ✅ 达标 | 每个类职责清晰 | +| **循环依赖** | ✅ 无 | 依赖链单向 | + +## 选做任务完成情况 + +### 正则策略匹配 ✅ +- `GenericNewsStrategy` 使用正则表达式 `.*\.(news|press|article)s?\..*` 匹配新闻类网站 + +### 默认策略 ✅ +- `StrategyFactory` 内置 `DefaultStrategy`,当没有匹配策略时返回空列表 + +### 策略优先级 ✅ +- `CrawlStrategy` 接口新增 `getPriority()` 默认方法 +- `GenericNewsStrategy` 设置优先级为 5(高于默认优先级 1) +- `StrategyFactory.getStrategy()` 遍历所有策略,选择优先级最高的匹配策略 + +### 思考题答案 + +**Q: 两个策略都 supports 同一 URL 时怎么办?** + +**A:** 采用**优先级机制**解决: + +1. 每个策略实现可以通过 `getPriority()` 返回优先级值 +2. `StrategyFactory.getStrategy()` 遍历所有策略时,记录最高优先级 +3. 如果多个策略都支持同一 URL,选择优先级最高的那个 +4. 如果优先级相同,选择最先注册的策略(遍历顺序决定) + +这种设计的优势: +- 允许通用策略(如 `GenericNewsStrategy`)和专用策略(如 `HnuNewsStrategy`)共存 +- 专用策略可设置更高优先级,确保精确匹配优先 +- 通用策略作为兜底,提高系统兼容性 + +## 命令功能对比 + +| 命令 | 功能 | 是否存储 | +|------|------|----------| +| `crawl ` | 爬取并存储文章 | ✅ 是 | +| `analyze ` | 分析文章统计(不存储) | ❌ 否 | +| `list` | 列出已存储文章 | - | +| `history` | 显示命令历史 | - | +| `help` | 显示帮助 | - | +| `exit` | 退出程序 | - | + +## 设计模式应用 + +1. **策略模式**:`CrawlStrategy` 接口定义标准,各策略独立实现 +2. **工厂模式**:`StrategyFactory` 根据 URL 自动选择策略 +3. **Repository 模式**:数据访问封装,防御式编程 +4. **命令模式**:所有 Command 统一签名,易于扩展 \ No newline at end of file diff --git a/w10/command/AnalyzeCommand.java b/w10/command/AnalyzeCommand.java new file mode 100644 index 0000000..913e4a9 --- /dev/null +++ b/w10/command/AnalyzeCommand.java @@ -0,0 +1,87 @@ +package com.example.datacollect.command; + +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.util.List; + +public class AnalyzeCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: analyze "); + return; + } + String url = args[1]; + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + return; + } + + try { + view.printInfo("Analyzing: " + url); + Document doc = Jsoup.connect(url).get(); + List
parsed = strategy.parse(url, doc); + + view.printInfo("=== Analysis Report ==="); + view.printInfo("Total articles found: " + parsed.size()); + + int titlesWithAuthor = 0; + int titlesWithDate = 0; + int titlesWithContent = 0; + + for (Article article : parsed) { + if (article.getAuthor() != null && !article.getAuthor().isEmpty()) { + titlesWithAuthor++; + } + if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) { + titlesWithDate++; + } + if (article.getContent() != null && !article.getContent().isEmpty()) { + titlesWithContent++; + } + } + + view.printInfo("Articles with author: " + titlesWithAuthor); + view.printInfo("Articles with publish date: " + titlesWithDate); + view.printInfo("Articles with content: " + titlesWithContent); + view.printInfo("Strategy used: " + strategy.getClass().getSimpleName()); + + if (parsed.size() > 0) { + view.printInfo("\nSample article titles:"); + int limit = Math.min(3, parsed.size()); + for (int i = 0; i < limit; i++) { + view.printInfo("- " + parsed.get(i).getTitle()); + } + if (parsed.size() > 3) { + view.printInfo("... and " + (parsed.size() - 3) + " more"); + } + } + + view.printSuccess("Analysis completed (not stored)"); + + } catch (Exception e) { + view.printError("Failed to analyze: " + e.getMessage()); + } + } +} \ No newline at end of file diff --git a/w10/command/Command.java b/w10/command/Command.java new file mode 100644 index 0000000..e3e2030 --- /dev/null +++ b/w10/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} \ No newline at end of file diff --git a/w10/command/CrawlCommand.java b/w10/command/CrawlCommand.java new file mode 100644 index 0000000..6f9e1a8 --- /dev/null +++ b/w10/command/CrawlCommand.java @@ -0,0 +1,50 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +public class CrawlCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: crawl "); + return; + } + String url = args[1]; + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + return; + } + + try { + view.printInfo("Crawling: " + url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + for (var article : articles) { + repository.add(article); + } + view.printSuccess("Crawled " + articles.size() + " articles."); + } catch (Exception e) { + view.printError("Failed to crawl: " + e.getMessage()); + } + } +} \ No newline at end of file diff --git a/w10/command/ExitCommand.java b/w10/command/ExitCommand.java new file mode 100644 index 0000000..15c2f00 --- /dev/null +++ b/w10/command/ExitCommand.java @@ -0,0 +1,23 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ExitCommand implements Command { + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printSuccess("Bye!"); + System.exit(0); + } +} \ No newline at end of file diff --git a/w10/command/HelpCommand.java b/w10/command/HelpCommand.java new file mode 100644 index 0000000..ec3ff87 --- /dev/null +++ b/w10/command/HelpCommand.java @@ -0,0 +1,28 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class HelpCommand implements Command { + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printInfo("Commands:"); + view.printInfo(" crawl - Crawl articles from URL and store"); + view.printInfo(" analyze - Analyze URL without storing"); + view.printInfo(" list - List all stored articles"); + view.printInfo(" history - Show command history"); + view.printInfo(" help - Show this help"); + view.printInfo(" exit - Exit the program"); + } +} \ No newline at end of file diff --git a/w10/command/HistoryCommand.java b/w10/command/HistoryCommand.java new file mode 100644 index 0000000..7baf7bd --- /dev/null +++ b/w10/command/HistoryCommand.java @@ -0,0 +1,33 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +import java.util.List; + +public class HistoryCommand implements Command { + private final ConsoleView view; + private final List commandHistory; + + public HistoryCommand(ConsoleView view, List commandHistory) { + this.view = view; + this.commandHistory = commandHistory; + } + + @Override + public String getName() { + return "history"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (commandHistory.isEmpty()) { + view.printInfo("No command history."); + return; + } + view.printInfo("Command history:"); + for (int i = 0; i < commandHistory.size(); i++) { + view.printInfo((i + 1) + ". " + commandHistory.get(i)); + } + } +} \ No newline at end of file diff --git a/w10/command/ListCommand.java b/w10/command/ListCommand.java new file mode 100644 index 0000000..29b3fc0 --- /dev/null +++ b/w10/command/ListCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ListCommand implements Command { + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.display(repository.getAll()); + } +} \ No newline at end of file diff --git a/w10/controller/CrawlerController.java b/w10/controller/CrawlerController.java new file mode 100644 index 0000000..e373dd8 --- /dev/null +++ b/w10/controller/CrawlerController.java @@ -0,0 +1,56 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.HistoryCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class CrawlerController { + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + private final List commandHistory = new ArrayList<>(); + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new AnalyzeCommand(view, strategyFactory)); + register(new ExitCommand(view)); + register(new HistoryCommand(view, commandHistory)); + } + + private void register(Command command) { + commands.put(command.getName(), command); + } + + public void handle(String input) { + String text = input == null ? "" : input.trim(); + if (text.isEmpty()) { + return; + } + + commandHistory.add(text); + + String[] args = text.split("\\s+"); + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + if (command == null) { + view.printError("Unknown command: " + cmdName); + return; + } + command.execute(args, repository); + } +} \ No newline at end of file diff --git a/w10/model/Article.java b/w10/model/Article.java new file mode 100644 index 0000000..b36034b --- /dev/null +++ b/w10/model/Article.java @@ -0,0 +1,73 @@ +package com.example.datacollect.model; + +public class Article { + private String title; + private String url; + private String content; + private String author; + private String publishDate; + + public Article(String title, String url, String content) { + this.title = title; + this.url = url; + this.content = content; + } + + public Article(String title, String url, String content, String author, String publishDate) { + this.title = title; + this.url = url; + this.content = content; + this.author = author; + this.publishDate = publishDate; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getAuthor() { + return author; + } + + public void setAuthor(String author) { + this.author = author; + } + + public String getPublishDate() { + return publishDate; + } + + public void setPublishDate(String publishDate) { + this.publishDate = publishDate; + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + ", author='" + author + '\'' + + ", publishDate='" + publishDate + '\'' + + '}'; + } +} \ No newline at end of file diff --git a/w10/repository/ArticleRepository.java b/w10/repository/ArticleRepository.java new file mode 100644 index 0000000..930221a --- /dev/null +++ b/w10/repository/ArticleRepository.java @@ -0,0 +1,41 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ArticleRepository { + private final List
articles = new ArrayList<>(); + + public void add(Article article) { + if (article == null) { + throw new IllegalArgumentException("Article cannot be null"); + } + articles.add(article); + } + + public void addAll(List
newArticles) { + if (newArticles == null) { + throw new IllegalArgumentException("Article list cannot be null"); + } + for (Article article : newArticles) { + if (article == null) { + throw new IllegalArgumentException("Article in list cannot be null"); + } + articles.add(article); + } + } + + public List
getAll() { + return Collections.unmodifiableList(articles); + } + + public int size() { + return articles.size(); + } + + public void clear() { + articles.clear(); + } +} \ No newline at end of file diff --git a/w10/strategy/BlogStrategy.java b/w10/strategy/BlogStrategy.java new file mode 100644 index 0000000..9ad5e10 --- /dev/null +++ b/w10/strategy/BlogStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("blog.example.com"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} \ No newline at end of file diff --git a/w10/strategy/CrawlStrategy.java b/w10/strategy/CrawlStrategy.java new file mode 100644 index 0000000..0ccdb2e --- /dev/null +++ b/w10/strategy/CrawlStrategy.java @@ -0,0 +1,19 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; +import java.util.regex.Pattern; + +public interface CrawlStrategy { + List
parse(String url, Document doc); + boolean supports(String url); + + default int getPriority() { + return 1; + } + + default Pattern getPattern() { + return null; + } +} \ No newline at end of file diff --git a/w10/strategy/GenericNewsStrategy.java b/w10/strategy/GenericNewsStrategy.java new file mode 100644 index 0000000..c893280 --- /dev/null +++ b/w10/strategy/GenericNewsStrategy.java @@ -0,0 +1,57 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class GenericNewsStrategy implements CrawlStrategy { + private static final Pattern PATTERN = Pattern.compile(".*\\.(news|press|article)s?\\..*"); + private static final int PRIORITY = 5; + + @Override + public boolean supports(String url) { + return PATTERN.matcher(url).find(); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + + Elements items = doc.select("article, .news-item, .article-item, [class*='news'], [class*='article']"); + + for (Element item : items) { + String title = item.selectFirst("h1, h2, h3, .title, [class*='title']") != null + ? item.selectFirst("h1, h2, h3, .title, [class*='title']").text().trim() + : ""; + + String articleUrl = item.selectFirst("a[href]") != null + ? item.selectFirst("a[href]").attr("abs:href") + : url; + + String content = item.selectFirst("p, .content, [class*='content']") != null + ? item.selectFirst("p, .content, [class*='content']").text().trim() + : ""; + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + } + } + + return articles; + } + + @Override + public int getPriority() { + return PRIORITY; + } + + @Override + public Pattern getPattern() { + return PATTERN; + } +} \ No newline at end of file diff --git a/w10/strategy/HnuNewsStrategy.java b/w10/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..1204e4d --- /dev/null +++ b/w10/strategy/HnuNewsStrategy.java @@ -0,0 +1,49 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class HnuNewsStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("news.hnu.edu.cn"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements listItems = doc.select("ul.list11 li"); + + for (Element li : listItems) { + Element link = li.selectFirst("a"); + if (link == null) continue; + + String articleUrl = link.attr("href"); + if (!articleUrl.startsWith("http")) { + articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); + } + + String title = ""; + Element titleEl = link.selectFirst("h4.l2.h4s2"); + if (titleEl != null) { + title = titleEl.text().trim(); + } + + String content = ""; + Element contentEl = link.selectFirst("p.l3.ps3"); + if (contentEl != null) { + content = contentEl.text().trim(); + } + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + } + } + + return articles; + } +} \ No newline at end of file diff --git a/w10/strategy/NewsStrategy.java b/w10/strategy/NewsStrategy.java new file mode 100644 index 0000000..7bfb888 --- /dev/null +++ b/w10/strategy/NewsStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("news.example.com"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} \ No newline at end of file diff --git a/w10/strategy/StrategyFactory.java b/w10/strategy/StrategyFactory.java new file mode 100644 index 0000000..e4789f5 --- /dev/null +++ b/w10/strategy/StrategyFactory.java @@ -0,0 +1,107 @@ +package com.example.datacollect.strategy; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class StrategyFactory { + private final List strategies = new ArrayList<>(); + private CrawlStrategy defaultStrategy; + + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + strategies.add(new BlogStrategy()); + strategies.add(new NewsStrategy()); + strategies.add(new GenericNewsStrategy()); + defaultStrategy = new DefaultStrategy(); + } + + public CrawlStrategy getStrategy(String url) { + CrawlStrategy matched = null; + int highestPriority = Integer.MIN_VALUE; + + for (CrawlStrategy s : strategies) { + boolean supports = false; + + Pattern pattern = s.getPattern(); + if (pattern != null) { + supports = pattern.matcher(url).find(); + } else { + supports = s.supports(url); + } + + if (supports) { + int priority = s.getPriority(); + if (priority > highestPriority) { + highestPriority = priority; + matched = s; + } + } + } + + if (matched != null) { + return matched; + } + + return defaultStrategy; + } + + public void register(CrawlStrategy strategy) { + strategies.add(strategy); + } + + public void register(CrawlStrategy strategy, int priority) { + strategies.add(new PrioritizedStrategy(strategy, priority)); + } + + public void setDefaultStrategy(CrawlStrategy defaultStrategy) { + this.defaultStrategy = defaultStrategy; + } + + private static class PrioritizedStrategy implements CrawlStrategy { + private final CrawlStrategy delegate; + private final int priority; + + public PrioritizedStrategy(CrawlStrategy delegate, int priority) { + this.delegate = delegate; + this.priority = priority; + } + + @Override + public List
parse(String url, Document doc) { + return delegate.parse(url, doc); + } + + @Override + public boolean supports(String url) { + return delegate.supports(url); + } + + @Override + public int getPriority() { + return priority; + } + + @Override + public Pattern getPattern() { + return delegate.getPattern(); + } + } + + private static class DefaultStrategy implements CrawlStrategy { + @Override + public List
parse(String url, Document doc) { + return List.of(); + } + + @Override + public boolean supports(String url) { + return false; + } + + @Override + public int getPriority() { + return Integer.MIN_VALUE; + } + } +} \ No newline at end of file diff --git a/w10/view/ConsoleView.java b/w10/view/ConsoleView.java new file mode 100644 index 0000000..987b617 --- /dev/null +++ b/w10/view/ConsoleView.java @@ -0,0 +1,42 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + return scanner.nextLine(); + } + + public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +} \ No newline at end of file