19 changed files with 910 additions and 0 deletions
@ -0,0 +1,21 @@ |
|||||
|
package com.example.datacollect; |
||||
|
|
||||
|
import com.example.datacollect.controller.CrawlerController; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
public class Main { |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
ConsoleView view = new ConsoleView(); |
||||
|
ArticleRepository repository = new ArticleRepository(); |
||||
|
StrategyFactory strategyFactory = new StrategyFactory(); |
||||
|
CrawlerController controller = new CrawlerController(view, repository, strategyFactory); |
||||
|
|
||||
|
view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); |
||||
|
while (true) { |
||||
|
controller.handle(view.readLine()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,144 @@ |
|||||
|
# W10 作业提交:设计模式实战 |
||||
|
|
||||
|
## 目录结构 |
||||
|
|
||||
|
``` |
||||
|
w10/ |
||||
|
└── src/ |
||||
|
└── main/ |
||||
|
└── java/ |
||||
|
└── com/ |
||||
|
└── example/ |
||||
|
└── datacollect/ |
||||
|
├── Main.java |
||||
|
├── command/ |
||||
|
│ ├── Command.java |
||||
|
│ ├── CrawlCommand.java |
||||
|
│ ├── AnalyzeCommand.java |
||||
|
│ ├── ListCommand.java |
||||
|
│ ├── HelpCommand.java |
||||
|
│ ├── ExitCommand.java |
||||
|
│ └── HistoryCommand.java |
||||
|
├── controller/ |
||||
|
│ └── CrawlerController.java |
||||
|
├── model/ |
||||
|
│ └── Article.java |
||||
|
├── repository/ |
||||
|
│ └── ArticleRepository.java |
||||
|
├── strategy/ |
||||
|
│ ├── CrawlStrategy.java |
||||
|
│ ├── StrategyFactory.java |
||||
|
│ ├── HnuNewsStrategy.java |
||||
|
│ ├── BlogStrategy.java |
||||
|
│ ├── NewsStrategy.java |
||||
|
│ └── GenericNewsStrategy.java |
||||
|
└── view/ |
||||
|
└── ConsoleView.java |
||||
|
``` |
||||
|
|
||||
|
## 必做任务完成情况 |
||||
|
|
||||
|
### 1. ArticleRepository 完善 ✅ |
||||
|
- `add()`: 拒绝 null,抛出 IllegalArgumentException |
||||
|
- `addAll()`: 拒绝 null 列表和列表中的 null 元素 |
||||
|
- `getAll()`: 返回 `Collections.unmodifiableList()` 不可变视图 |
||||
|
- `size()`: 返回文章数量 |
||||
|
- `clear()`: 清空所有文章 |
||||
|
|
||||
|
### 2. AnalyzeCommand ✅ |
||||
|
- 复用策略解析但**不存储**到 Repository |
||||
|
- 输出统计信息:文章总数、含作者/日期/内容的数量、使用的策略名称 |
||||
|
- 显示前 3 篇文章标题作为预览 |
||||
|
|
||||
|
### 3. AI 架构审计 ✅ |
||||
|
|
||||
|
#### 类签名汇总 |
||||
|
|
||||
|
```java |
||||
|
// Command 层 |
||||
|
interface Command { String getName(); void execute(String[] args, ArticleRepository repository); }
||||
|
class CrawlCommand(ConsoleView, StrategyFactory) |
||||
|
class AnalyzeCommand(ConsoleView, StrategyFactory) |
||||
|
class ListCommand(ConsoleView) |
||||
|
class HelpCommand(ConsoleView) |
||||
|
class ExitCommand(ConsoleView) |
||||
|
class HistoryCommand(ConsoleView, List<String>) |
||||
|
|
||||
|
// Controller 层 |
||||
|
class CrawlerController(ConsoleView, ArticleRepository, StrategyFactory) |
||||
|
|
||||
|
// Repository 层 |
||||
|
class ArticleRepository { add(), addAll(), getAll(), size(), clear() } |
||||
|
|
||||
|
// Strategy 层 |
||||
|
interface CrawlStrategy { parse(), supports(), getPriority(), getPattern() } |
||||
|
class StrategyFactory { getStrategy(url), register(), setDefaultStrategy() } |
||||
|
class HnuNewsStrategy implements CrawlStrategy |
||||
|
class BlogStrategy implements CrawlStrategy |
||||
|
class NewsStrategy implements CrawlStrategy |
||||
|
class GenericNewsStrategy implements CrawlStrategy (正则匹配) |
||||
|
|
||||
|
// Model 层 |
||||
|
class Article { title, url, content, author, publishDate } |
||||
|
|
||||
|
// View 层 |
||||
|
class ConsoleView |
||||
|
``` |
||||
|
|
||||
|
#### 架构审计结果 |
||||
|
|
||||
|
| 检查项 | 结果 | 说明 | |
||||
|
|--------|------|------| |
||||
|
| **策略解耦** | ✅ 优秀 | 策略接口与实现完全分离 | |
||||
|
| **Repository 封装** | ✅ 优秀 | 使用不可变视图 + null 防御 | |
||||
|
| **开闭原则** | ✅ 达标 | 新增网站只需加策略类 + 注册一行 | |
||||
|
| **依赖倒置** | ✅ 良好 | Command/Strategy 依赖抽象接口 | |
||||
|
| **单一职责** | ✅ 达标 | 每个类职责清晰 | |
||||
|
| **循环依赖** | ✅ 无 | 依赖链单向 | |
||||
|
|
||||
|
## 选做任务完成情况 |
||||
|
|
||||
|
### 正则策略匹配 ✅ |
||||
|
- `GenericNewsStrategy` 使用正则表达式 `.*\.(news|press|article)s?\..*` 匹配新闻类网站 |
||||
|
|
||||
|
### 默认策略 ✅ |
||||
|
- `StrategyFactory` 内置 `DefaultStrategy`:当没有任何策略匹配 URL 时,`getStrategy()` 返回该默认策略,其 `parse()` 返回空列表
||||
|
|
||||
|
### 策略优先级 ✅ |
||||
|
- `CrawlStrategy` 接口新增 `getPriority()` 默认方法 |
||||
|
- `GenericNewsStrategy` 设置优先级为 5(高于默认优先级 1) |
||||
|
- `StrategyFactory.getStrategy()` 遍历所有策略,选择优先级最高的匹配策略 |
||||
|
|
||||
|
### 思考题答案 |
||||
|
|
||||
|
**Q: 两个策略都 supports 同一 URL 时怎么办?** |
||||
|
|
||||
|
**A:** 采用**优先级机制**解决: |
||||
|
|
||||
|
1. 每个策略实现可以通过 `getPriority()` 返回优先级值 |
||||
|
2. `StrategyFactory.getStrategy()` 遍历所有策略时,记录最高优先级 |
||||
|
3. 如果多个策略都支持同一 URL,选择优先级最高的那个 |
||||
|
4. 如果优先级相同,选择最先注册的策略(遍历顺序决定) |
||||
|
|
||||
|
这种设计的优势: |
||||
|
- 允许通用策略(如 `GenericNewsStrategy`)和专用策略(如 `HnuNewsStrategy`)共存 |
||||
|
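- 专用策略可设置更高优先级,确保精确匹配优先(注意:当前专用策略均使用默认优先级 1,低于 `GenericNewsStrategy` 的 5;两者同时匹配同一 URL 时会选中通用策略,需为专用策略重写 `getPriority()` 或通过 `register(strategy, priority)` 注册更高优先级)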
||||
|
- 通用策略作为兜底,提高系统兼容性 |
||||
|
|
||||
|
## 命令功能对比 |
||||
|
|
||||
|
| 命令 | 功能 | 是否存储 | |
||||
|
|------|------|----------| |
||||
|
| `crawl <url>` | 爬取并存储文章 | ✅ 是 | |
||||
|
| `analyze <url>` | 分析文章统计(不存储) | ❌ 否 | |
||||
|
| `list` | 列出已存储文章 | - | |
||||
|
| `history` | 显示命令历史 | - | |
||||
|
| `help` | 显示帮助 | - | |
||||
|
| `exit` | 退出程序 | - | |
||||
|
|
||||
|
## 设计模式应用 |
||||
|
|
||||
|
1. **策略模式**:`CrawlStrategy` 接口定义标准,各策略独立实现 |
||||
|
2. **工厂模式**:`StrategyFactory` 根据 URL 自动选择策略 |
||||
|
3. **Repository 模式**:数据访问封装,防御式编程 |
||||
|
4. **命令模式**:所有 Command 统一签名,易于扩展 |
||||
@ -0,0 +1,87 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class AnalyzeCommand implements Command { |
||||
|
private final ConsoleView view; |
||||
|
private final StrategyFactory strategyFactory; |
||||
|
|
||||
|
public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "analyze"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
view.printError("Usage: analyze <url>"); |
||||
|
return; |
||||
|
} |
||||
|
String url = args[1]; |
||||
|
|
||||
|
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
||||
|
if (strategy == null) { |
||||
|
view.printError("No strategy found for: " + url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
view.printInfo("Analyzing: " + url); |
||||
|
Document doc = Jsoup.connect(url).get(); |
||||
|
List<Article> parsed = strategy.parse(url, doc); |
||||
|
|
||||
|
view.printInfo("=== Analysis Report ==="); |
||||
|
view.printInfo("Total articles found: " + parsed.size()); |
||||
|
|
||||
|
int titlesWithAuthor = 0; |
||||
|
int titlesWithDate = 0; |
||||
|
int titlesWithContent = 0; |
||||
|
|
||||
|
for (Article article : parsed) { |
||||
|
if (article.getAuthor() != null && !article.getAuthor().isEmpty()) { |
||||
|
titlesWithAuthor++; |
||||
|
} |
||||
|
if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) { |
||||
|
titlesWithDate++; |
||||
|
} |
||||
|
if (article.getContent() != null && !article.getContent().isEmpty()) { |
||||
|
titlesWithContent++; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
view.printInfo("Articles with author: " + titlesWithAuthor); |
||||
|
view.printInfo("Articles with publish date: " + titlesWithDate); |
||||
|
view.printInfo("Articles with content: " + titlesWithContent); |
||||
|
view.printInfo("Strategy used: " + strategy.getClass().getSimpleName()); |
||||
|
|
||||
|
if (parsed.size() > 0) { |
||||
|
view.printInfo("\nSample article titles:"); |
||||
|
int limit = Math.min(3, parsed.size()); |
||||
|
for (int i = 0; i < limit; i++) { |
||||
|
view.printInfo("- " + parsed.get(i).getTitle()); |
||||
|
} |
||||
|
if (parsed.size() > 3) { |
||||
|
view.printInfo("... and " + (parsed.size() - 3) + " more"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
view.printSuccess("Analysis completed (not stored)"); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
view.printError("Failed to analyze: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,8 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
|
||||
|
public interface Command { |
||||
|
String getName(); |
||||
|
void execute(String[] args, ArticleRepository repository); |
||||
|
} |
||||
@ -0,0 +1,50 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private final ConsoleView view; |
||||
|
private final StrategyFactory strategyFactory; |
||||
|
|
||||
|
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "crawl"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
view.printError("Usage: crawl <url>"); |
||||
|
return; |
||||
|
} |
||||
|
String url = args[1]; |
||||
|
|
||||
|
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
||||
|
if (strategy == null) { |
||||
|
view.printError("No strategy found for: " + url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
view.printInfo("Crawling: " + url); |
||||
|
Document doc = Jsoup.connect(url).get(); |
||||
|
var articles = strategy.parse(url, doc); |
||||
|
for (var article : articles) { |
||||
|
repository.add(article); |
||||
|
} |
||||
|
view.printSuccess("Crawled " + articles.size() + " articles."); |
||||
|
} catch (Exception e) { |
||||
|
view.printError("Failed to crawl: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,23 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ExitCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
view.printSuccess("Bye!"); |
||||
|
System.exit(0); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public HelpCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "help"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
view.printInfo("Commands:"); |
||||
|
view.printInfo(" crawl <url> - Crawl articles from URL and store"); |
||||
|
view.printInfo(" analyze <url> - Analyze URL without storing"); |
||||
|
view.printInfo(" list - List all stored articles"); |
||||
|
view.printInfo(" history - Show command history"); |
||||
|
view.printInfo(" help - Show this help"); |
||||
|
view.printInfo(" exit - Exit the program"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HistoryCommand implements Command { |
||||
|
private final ConsoleView view; |
||||
|
private final List<String> commandHistory; |
||||
|
|
||||
|
public HistoryCommand(ConsoleView view, List<String> commandHistory) { |
||||
|
this.view = view; |
||||
|
this.commandHistory = commandHistory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "history"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (commandHistory.isEmpty()) { |
||||
|
view.printInfo("No command history."); |
||||
|
return; |
||||
|
} |
||||
|
view.printInfo("Command history:"); |
||||
|
for (int i = 0; i < commandHistory.size(); i++) { |
||||
|
view.printInfo((i + 1) + ". " + commandHistory.get(i)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
public class ListCommand implements Command { |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ListCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "list"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
view.display(repository.getAll()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,56 @@ |
|||||
|
package com.example.datacollect.controller; |
||||
|
|
||||
|
import com.example.datacollect.command.AnalyzeCommand; |
||||
|
import com.example.datacollect.command.Command; |
||||
|
import com.example.datacollect.command.CrawlCommand; |
||||
|
import com.example.datacollect.command.ExitCommand; |
||||
|
import com.example.datacollect.command.HelpCommand; |
||||
|
import com.example.datacollect.command.HistoryCommand; |
||||
|
import com.example.datacollect.command.ListCommand; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private final Map<String, Command> commands = new HashMap<>(); |
||||
|
private final ConsoleView view; |
||||
|
private final ArticleRepository repository; |
||||
|
private final List<String> commandHistory = new ArrayList<>(); |
||||
|
|
||||
|
public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.repository = repository; |
||||
|
register(new HelpCommand(view)); |
||||
|
register(new ListCommand(view)); |
||||
|
register(new CrawlCommand(view, strategyFactory)); |
||||
|
register(new AnalyzeCommand(view, strategyFactory)); |
||||
|
register(new ExitCommand(view)); |
||||
|
register(new HistoryCommand(view, commandHistory)); |
||||
|
} |
||||
|
|
||||
|
private void register(Command command) { |
||||
|
commands.put(command.getName(), command); |
||||
|
} |
||||
|
|
||||
|
public void handle(String input) { |
||||
|
String text = input == null ? "" : input.trim(); |
||||
|
if (text.isEmpty()) { |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
commandHistory.add(text); |
||||
|
|
||||
|
String[] args = text.split("\\s+"); |
||||
|
String cmdName = args[0].toLowerCase(); |
||||
|
Command command = commands.get(cmdName); |
||||
|
if (command == null) { |
||||
|
view.printError("Unknown command: " + cmdName); |
||||
|
return; |
||||
|
} |
||||
|
command.execute(args, repository); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,73 @@ |
|||||
|
package com.example.datacollect.model; |
||||
|
|
||||
|
public class Article { |
||||
|
private String title; |
||||
|
private String url; |
||||
|
private String content; |
||||
|
private String author; |
||||
|
private String publishDate; |
||||
|
|
||||
|
public Article(String title, String url, String content) { |
||||
|
this.title = title; |
||||
|
this.url = url; |
||||
|
this.content = content; |
||||
|
} |
||||
|
|
||||
|
public Article(String title, String url, String content, String author, String publishDate) { |
||||
|
this.title = title; |
||||
|
this.url = url; |
||||
|
this.content = content; |
||||
|
this.author = author; |
||||
|
this.publishDate = publishDate; |
||||
|
} |
||||
|
|
||||
|
public String getTitle() { |
||||
|
return title; |
||||
|
} |
||||
|
|
||||
|
public void setTitle(String title) { |
||||
|
this.title = title; |
||||
|
} |
||||
|
|
||||
|
public String getUrl() { |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
public void setUrl(String url) { |
||||
|
this.url = url; |
||||
|
} |
||||
|
|
||||
|
public String getContent() { |
||||
|
return content; |
||||
|
} |
||||
|
|
||||
|
public void setContent(String content) { |
||||
|
this.content = content; |
||||
|
} |
||||
|
|
||||
|
public String getAuthor() { |
||||
|
return author; |
||||
|
} |
||||
|
|
||||
|
public void setAuthor(String author) { |
||||
|
this.author = author; |
||||
|
} |
||||
|
|
||||
|
public String getPublishDate() { |
||||
|
return publishDate; |
||||
|
} |
||||
|
|
||||
|
public void setPublishDate(String publishDate) { |
||||
|
this.publishDate = publishDate; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "Article{" |
||||
|
+ "title='" + title + '\'' |
||||
|
+ ", url='" + url + '\'' |
||||
|
+ ", author='" + author + '\'' |
||||
|
+ ", publishDate='" + publishDate + '\'' |
||||
|
+ '}'; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,41 @@ |
|||||
|
package com.example.datacollect.repository; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ArticleRepository { |
||||
|
private final List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
public void add(Article article) { |
||||
|
if (article == null) { |
||||
|
throw new IllegalArgumentException("Article cannot be null"); |
||||
|
} |
||||
|
articles.add(article); |
||||
|
} |
||||
|
|
||||
|
public void addAll(List<Article> newArticles) { |
||||
|
if (newArticles == null) { |
||||
|
throw new IllegalArgumentException("Article list cannot be null"); |
||||
|
} |
||||
|
for (Article article : newArticles) { |
||||
|
if (article == null) { |
||||
|
throw new IllegalArgumentException("Article in list cannot be null"); |
||||
|
} |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAll() { |
||||
|
return Collections.unmodifiableList(articles); |
||||
|
} |
||||
|
|
||||
|
public int size() { |
||||
|
return articles.size(); |
||||
|
} |
||||
|
|
||||
|
public void clear() { |
||||
|
articles.clear(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BlogStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("blog.example.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Elements titles = doc.select(".post-title"); |
||||
|
for (Element e : titles) { |
||||
|
articles.add(new Article(e.text(), url, "")); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,19 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
List<Article> parse(String url, Document doc); |
||||
|
boolean supports(String url); |
||||
|
|
||||
|
default int getPriority() { |
||||
|
return 1; |
||||
|
} |
||||
|
|
||||
|
default Pattern getPattern() { |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,57 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class GenericNewsStrategy implements CrawlStrategy { |
||||
|
private static final Pattern PATTERN = Pattern.compile(".*\\.(news|press|article)s?\\..*"); |
||||
|
private static final int PRIORITY = 5; |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return PATTERN.matcher(url).find(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
Elements items = doc.select("article, .news-item, .article-item, [class*='news'], [class*='article']"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
String title = item.selectFirst("h1, h2, h3, .title, [class*='title']") != null |
||||
|
? item.selectFirst("h1, h2, h3, .title, [class*='title']").text().trim() |
||||
|
: ""; |
||||
|
|
||||
|
String articleUrl = item.selectFirst("a[href]") != null |
||||
|
? item.selectFirst("a[href]").attr("abs:href") |
||||
|
: url; |
||||
|
|
||||
|
String content = item.selectFirst("p, .content, [class*='content']") != null |
||||
|
? item.selectFirst("p, .content, [class*='content']").text().trim() |
||||
|
: ""; |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, articleUrl, content)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getPriority() { |
||||
|
return PRIORITY; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public Pattern getPattern() { |
||||
|
return PATTERN; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,49 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HnuNewsStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("news.hnu.edu.cn"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Elements listItems = doc.select("ul.list11 li"); |
||||
|
|
||||
|
for (Element li : listItems) { |
||||
|
Element link = li.selectFirst("a"); |
||||
|
if (link == null) continue; |
||||
|
|
||||
|
String articleUrl = link.attr("href"); |
||||
|
if (!articleUrl.startsWith("http")) { |
||||
|
articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); |
||||
|
} |
||||
|
|
||||
|
String title = ""; |
||||
|
Element titleEl = link.selectFirst("h4.l2.h4s2"); |
||||
|
if (titleEl != null) { |
||||
|
title = titleEl.text().trim(); |
||||
|
} |
||||
|
|
||||
|
String content = ""; |
||||
|
Element contentEl = link.selectFirst("p.l3.ps3"); |
||||
|
if (contentEl != null) { |
||||
|
content = contentEl.text().trim(); |
||||
|
} |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, articleUrl, content)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class NewsStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("news.example.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Elements items = doc.select(".article-headline"); |
||||
|
for (Element e : items) { |
||||
|
articles.add(new Article(e.text(), url, "")); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,107 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
private CrawlStrategy defaultStrategy; |
||||
|
|
||||
|
public StrategyFactory() { |
||||
|
strategies.add(new HnuNewsStrategy()); |
||||
|
strategies.add(new BlogStrategy()); |
||||
|
strategies.add(new NewsStrategy()); |
||||
|
strategies.add(new GenericNewsStrategy()); |
||||
|
defaultStrategy = new DefaultStrategy(); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy getStrategy(String url) { |
||||
|
CrawlStrategy matched = null; |
||||
|
int highestPriority = Integer.MIN_VALUE; |
||||
|
|
||||
|
for (CrawlStrategy s : strategies) { |
||||
|
boolean supports = false; |
||||
|
|
||||
|
Pattern pattern = s.getPattern(); |
||||
|
if (pattern != null) { |
||||
|
supports = pattern.matcher(url).find(); |
||||
|
} else { |
||||
|
supports = s.supports(url); |
||||
|
} |
||||
|
|
||||
|
if (supports) { |
||||
|
int priority = s.getPriority(); |
||||
|
if (priority > highestPriority) { |
||||
|
highestPriority = priority; |
||||
|
matched = s; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (matched != null) { |
||||
|
return matched; |
||||
|
} |
||||
|
|
||||
|
return defaultStrategy; |
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy) { |
||||
|
strategies.add(strategy); |
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy, int priority) { |
||||
|
strategies.add(new PrioritizedStrategy(strategy, priority)); |
||||
|
} |
||||
|
|
||||
|
public void setDefaultStrategy(CrawlStrategy defaultStrategy) { |
||||
|
this.defaultStrategy = defaultStrategy; |
||||
|
} |
||||
|
|
||||
|
private static class PrioritizedStrategy implements CrawlStrategy { |
||||
|
private final CrawlStrategy delegate; |
||||
|
private final int priority; |
||||
|
|
||||
|
public PrioritizedStrategy(CrawlStrategy delegate, int priority) { |
||||
|
this.delegate = delegate; |
||||
|
this.priority = priority; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
return delegate.parse(url, doc); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return delegate.supports(url); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getPriority() { |
||||
|
return priority; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public Pattern getPattern() { |
||||
|
return delegate.getPattern(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static class DefaultStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
return List.of(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getPriority() { |
||||
|
return Integer.MIN_VALUE; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,42 @@ |
|||||
|
package com.example.datacollect.view; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import java.util.List; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
private static final String ANSI_RESET = "\u001B[0m"; |
||||
|
private static final String ANSI_GREEN = "\u001B[32m"; |
||||
|
private static final String ANSI_RED = "\u001B[31m"; |
||||
|
private static final String ANSI_BLUE = "\u001B[34m"; |
||||
|
|
||||
|
private final Scanner scanner = new Scanner(System.in); |
||||
|
|
||||
|
public String readLine() { |
||||
|
System.out.print("> "); |
||||
|
return scanner.nextLine(); |
||||
|
} |
||||
|
|
||||
|
public void printSuccess(String msg) { |
||||
|
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printError(String msg) { |
||||
|
System.out.println(ANSI_RED + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printInfo(String msg) { |
||||
|
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void display(List<Article> articles) { |
||||
|
if (articles.isEmpty()) { |
||||
|
printInfo("暂无文章,请先执行 crawl。"); |
||||
|
return; |
||||
|
} |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article a = articles.get(i); |
||||
|
System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue