Browse Source

王烊烊202302050115W11

master
WangYangyang 1 month ago
parent
commit
506794f9d9
  1. 34
      w11/datacollect/Main.java
  2. 101
      w11/datacollect/command/AnalyzeCommand.java
  3. 8
      w11/datacollect/command/Command.java
  4. 115
      w11/datacollect/command/CrawlCommand.java
  5. 34
      w11/datacollect/command/ExitCommand.java
  6. 32
      w11/datacollect/command/HelpCommand.java
  7. 33
      w11/datacollect/command/ListCommand.java
  8. 62
      w11/datacollect/controller/CrawlerController.java
  9. 20
      w11/datacollect/exception/CrawlerException.java
  10. 19
      w11/datacollect/exception/NetworkException.java
  11. 19
      w11/datacollect/exception/ParseException.java
  12. 45
      w11/datacollect/model/Article.java
  13. 100
      w11/datacollect/repository/ArticleRepository.java
  14. 47
      w11/datacollect/strategy/BlogStrategy.java
  15. 15
      w11/datacollect/strategy/CrawlStrategy.java
  16. 25
      w11/datacollect/strategy/DefaultStrategy.java
  17. 95
      w11/datacollect/strategy/HnuNewsStrategy.java
  18. 57
      w11/datacollect/strategy/NewsStrategy.java
  19. 29
      w11/datacollect/strategy/StrategyFactory.java
  20. 56
      w11/datacollect/view/ConsoleView.java
  21. 39
      w11/logback.xml
  22. 96
      w11/pom.xml

34
w11/datacollect/Main.java

@ -0,0 +1,34 @@
package com.example.datacollect;
import com.example.datacollect.controller.CrawlerController;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Entry point of the command-line crawler.
 *
 * <p>Creates the view, repository, strategy factory and controller, prints a welcome message and
 * then passes every line read from the console to the controller until the process exits.
 */
public class Main {
    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    /**
     * Runs the interactive read/dispatch loop.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        logger.info("Starting CLI Crawler (w10_3)");
        ConsoleView view = new ConsoleView();
        ArticleRepository repository = new ArticleRepository();
        StrategyFactory strategyFactory = new StrategyFactory();
        CrawlerController controller = new CrawlerController(view, repository, strategyFactory);

        // printSuccess writes the message to the log as well, so it is not logged separately here.
        view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands.");

        try {
            while (true) {
                String line;
                try {
                    line = view.readLine();
                } catch (java.util.NoSuchElementException endOfInput) {
                    // The input stream has no more lines (e.g. piped input or Ctrl-D): this is a
                    // normal way to end the session, not a system error.
                    logger.info("Input stream closed, shutting down");
                    return;
                }
                controller.handle(line);
            }
        } catch (Exception e) {
            logger.error("Unexpected error in main loop", e);
            view.printError("System error: " + e.getMessage());
            System.exit(1);
        }
    }
}

101
w11/datacollect/command/AnalyzeCommand.java

@ -0,0 +1,101 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Command {@code analyze <url>}: downloads the page, runs the matching crawl strategy on it and
 * reports statistics about the extracted articles. Nothing is stored in the repository.
 */
public class AnalyzeCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class);
    private final ConsoleView view;
    private final StrategyFactory strategyFactory;

    /**
     * @param view            console used for user-facing output
     * @param strategyFactory source of the parsing strategy for a given URL
     */
    public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.view = view;
        this.strategyFactory = strategyFactory;
    }

    @Override
    public String getName() {
        return "analyze";
    }

    /**
     * Fetches {@code args[1]}, parses it and prints the article count, the average title length
     * and the five articles with the longest titles, both to the log and to the console.
     *
     * @param args       tokenised user input; {@code args[1]} must be the URL to analyze
     * @param repository not modified by this command
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args.length < 2) {
            logger.error("指令参数错误,正确用法: analyze <url>");
            // Keep the console hint so the user sees the usage directly.
            view.printError("Usage: analyze <url>");
            return;
        }
        String url = args[1];
        CrawlStrategy strategy = strategyFactory.getStrategy(url);
        if (strategy == null) {
            logger.error("未找到适用于 URL [{}] 的抓取策略", url);
            view.printError("No strategy found for: " + url);
            return;
        }
        try {
            logger.info("开始分析目标网站: {}", url);
            Document doc = Jsoup.connect(url).get();
            // Parse with the strategy, but do not store anything in the repository.
            List<Article> articles = strategy.parse(url, doc);

            int total = articles.size();
            double avgTitleLen = articles.stream()
                    .mapToInt(AnalyzeCommand::titleLength)
                    .average()
                    .orElse(0.0);
            // Top 5 by title length, longest first.
            List<Article> top5 = articles.stream()
                    .sorted((a, b) -> Integer.compare(titleLength(b), titleLength(a)))
                    .limit(5)
                    .collect(Collectors.toList());
            // SLF4J only understands the bare "{}" placeholder, so the number is formatted first.
            String avgText = String.format("%.2f", avgTitleLen);

            logger.info("=== 分析结果 ===");
            logger.info("提取文章总数: {}", total);
            logger.info("平均标题长度: {} 字符", avgText);
            logger.info("Top 5 文章 (按标题长度排序):");
            int rank = 1;
            for (Article a : top5) {
                logger.info("{}. {} ({} 字符)", rank, a.getTitle(), titleLength(a));
                rank++;
            }
            logger.info("==================");

            // Console output is kept so the interactive experience is unchanged.
            view.printInfo("=== Analysis Result ===");
            view.printInfo("Total Articles: " + total);
            view.printInfo("Avg Title Length: " + avgText);
            view.printInfo("Top 5 Articles (by Title Length):");
            rank = 1;
            for (Article a : top5) {
                view.printInfo(rank + ". " + a.getTitle() + " (" + titleLength(a) + " chars)");
                rank++;
            }
            view.printInfo("========================");
        } catch (Exception e) {
            // Passing the exception as the last argument records the full stack trace.
            logger.error("分析 URL [{}] 时发生异常: ", url, e);
            view.printError("Failed to analyze: " + e.getMessage());
        }
    }

    /** Returns the length of the article's title, treating a missing title as length 0. */
    private static int titleLength(Article article) {
        String title = article.getTitle();
        return title == null ? 0 : title.length();
    }
}

8
w11/datacollect/command/Command.java

@ -0,0 +1,8 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
/**
 * A console command that the controller can look up by name and run.
 */
public interface Command {

    /**
     * Runs the command.
     *
     * @param args       the whitespace-separated tokens of the user's input; the first token is
     *                   the command word itself
     * @param repository the article store the command may read from or write to
     */
    void execute(String[] args, ArticleRepository repository);

    /**
     * @return the word under which this command is registered and invoked
     */
    String getName();
}

115
w11/datacollect/command/CrawlCommand.java

@ -0,0 +1,115 @@
package com.example.datacollect.command;
import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.concurrent.TimeUnit;
/**
 * Command {@code crawl <url>}: downloads the page, parses it with the matching strategy and
 * stores the resulting articles in the repository. Network failures are retried; parse failures
 * and unexpected errors are not.
 */
public class CrawlCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
    /** Maximum number of fetch attempts. */
    private static final int MAX_RETRY = 3;
    /** Pause between two attempts, in milliseconds. */
    private static final long RETRY_INTERVAL = 1000;
    private final ConsoleView view;
    private final StrategyFactory strategyFactory;

    /**
     * @param view            console used for user-facing output
     * @param strategyFactory source of the parsing strategy for a given URL
     */
    public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.view = view;
        this.strategyFactory = strategyFactory;
    }

    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * Crawls {@code args[1]} and adds the parsed articles to {@code repository}.
     *
     * @param args       tokenised user input; {@code args[1]} must be the URL to crawl
     * @param repository receives the parsed articles via {@code addAll}
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args.length < 2) {
            String errorMsg = "Crawl command usage: crawl <url>";
            logger.error(errorMsg);
            view.printError(errorMsg);
            return;
        }
        String url = args[1];
        logger.info("Start crawling url: {}", url);
        CrawlStrategy strategy = strategyFactory.getStrategy(url);
        if (strategy == null) {
            String errorMsg = "No crawl strategy found for url: " + url;
            logger.error(errorMsg);
            view.printError(errorMsg);
            return;
        }

        int retryCount = 0;
        while (retryCount < MAX_RETRY) {
            try {
                Document doc = fetchDocumentWithRetry(url, retryCount);
                List<Article> articles = strategy.parse(url, doc);
                repository.addAll(articles);
                String successMsg = "Crawled " + articles.size() + " articles from url: " + url;
                logger.info(successMsg);
                view.printSuccess(successMsg);
                return; // success ends the retry loop
            } catch (NetworkException e) {
                retryCount++;
                String retryMsg = String.format(
                        "Network error (retry %d/%d): %s", retryCount, MAX_RETRY, e.getMessage());
                logger.warn(retryMsg);
                view.printError(retryMsg);
                if (retryCount >= MAX_RETRY) {
                    String failMsg = "Failed to crawl url after " + MAX_RETRY + " retries: " + url;
                    logger.error(failMsg, e);
                    view.printError(failMsg);
                    // No attempt follows, so return immediately instead of sleeping first.
                    return;
                }
                try {
                    TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL);
                } catch (InterruptedException ie) {
                    // Restore the interrupt flag for the caller and stop retrying.
                    Thread.currentThread().interrupt();
                    logger.error("Retry sleep interrupted", ie);
                    return;
                }
            } catch (ParseException e) {
                String errorMsg = "Parse failed for url: " + url;
                logger.error(errorMsg, e);
                view.printError(errorMsg);
                return; // a parse failure will not go away by retrying
            } catch (Exception e) {
                String errorMsg = "Unexpected error when crawling url: " + url;
                logger.error(errorMsg, e);
                view.printError(errorMsg);
                return;
            }
        }
    }

    /**
     * Downloads the page with a 5 second timeout and a browser-like user agent.
     *
     * @param url        page to download
     * @param retryCount number of failed attempts so far; only used in log and error messages
     * @return the parsed document
     * @throws NetworkException wrapping any exception raised while fetching
     */
    private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException {
        try {
            logger.debug("Fetching document (retry {}) for url: {}", retryCount, url);
            return Jsoup.connect(url)
                    .timeout(5000)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .get();
        } catch (Exception e) {
            throw new NetworkException(
                    "Failed to fetch document (retry " + retryCount + ") for url: " + url, e);
        }
    }
}

34
w11/datacollect/command/ExitCommand.java

@ -0,0 +1,34 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command {@code exit}: prints a goodbye message and terminates the JVM with status 0.
 */
public class ExitCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class);
    private static final String COMMAND_NAME = "exit";
    private static final int STATUS_OK = 0;

    private final ConsoleView view;

    /**
     * @param view console used to print the goodbye message
     */
    public ExitCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return COMMAND_NAME;
    }

    /**
     * Logs the request, says goodbye and ends the process; this method does not return.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.info("用户请求退出程序。");
        view.printSuccess("Bye!");
        // Last log entry before the JVM is stopped.
        logger.info("程序已终止。");
        System.exit(STATUS_OK);
    }
}

32
w11/datacollect/command/HelpCommand.java

@ -0,0 +1,32 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command {@code help}: prints the list of available commands.
 */
public class HelpCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class);
    private final ConsoleView view;

    /**
     * @param view console used to print the help text
     */
    public HelpCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "help";
    }

    /**
     * Prints the command overview.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("用户请求查看帮助信息。");
        // analyze requires a URL just like crawl, so the overview shows the argument for both.
        // The list is hard-coded; it could be built from the registered commands instead.
        view.printInfo("Commands: crawl <url>, list, analyze <url>, help, exit");
    }
}

33
w11/datacollect/command/ListCommand.java

@ -0,0 +1,33 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command {@code list}: shows every article currently held by the repository.
 */
public class ListCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ListCommand.class);
    private static final String COMMAND_NAME = "list";

    private final ConsoleView view;

    /**
     * @param view console used to display the articles
     */
    public ListCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return COMMAND_NAME;
    }

    /**
     * Hands the repository content to the view and logs how many articles were shown.
     *
     * @param args       ignored
     * @param repository source of the articles to display
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.info("正在执行 list 命令,准备展示已抓取的文章列表。");
        view.display(repository.getAll());
        int shown = repository.getAll().size();
        logger.debug("当前仓库中共有 {} 篇文章已加载至视图。", shown);
    }
}

62
w11/datacollect/controller/CrawlerController.java

@ -0,0 +1,62 @@
package com.example.datacollect.controller;
import com.example.datacollect.command.Command;
import com.example.datacollect.command.AnalyzeCommand;
import com.example.datacollect.command.CrawlCommand;
import com.example.datacollect.command.ExitCommand;
import com.example.datacollect.command.HelpCommand;
import com.example.datacollect.command.ListCommand;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Dispatches console input to the registered {@link Command}s.
 *
 * <p>The constructor registers the help, list, crawl, analyze and exit commands; {@link #handle}
 * looks a command up by the first word of the input and executes it.
 */
public class CrawlerController {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class);
    /** Registered commands keyed by {@link Command#getName()}. */
    private final Map<String, Command> commands = new HashMap<>();
    private final ConsoleView view;
    private final ArticleRepository repository;

    /**
     * @param view            console used for error output and passed to the commands
     * @param repository      article store handed to every executed command
     * @param strategyFactory strategy source passed to the crawl and analyze commands
     */
    public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) {
        this.view = view;
        this.repository = repository;
        logger.info("Registering crawler commands");
        register(new HelpCommand(view));
        register(new ListCommand(view));
        register(new CrawlCommand(view, strategyFactory));
        register(new AnalyzeCommand(view, strategyFactory));
        register(new ExitCommand(view));
        logger.debug("Registered commands: {}", commands.keySet());
    }

    /** Stores the command under its own name, replacing any command with the same name. */
    private void register(Command command) {
        commands.put(command.getName(), command);
        logger.debug("Registered command: {}", command.getName());
    }

    /**
     * Parses one line of user input and runs the matching command.
     *
     * <p>{@code null} or blank input is ignored. An unknown command word is reported through the
     * view and the log.
     *
     * @param input raw line typed by the user; may be {@code null}
     */
    public void handle(String input) {
        String text = input == null ? "" : input.trim();
        logger.debug("Handling input: {}", text);
        if (text.isEmpty()) {
            logger.debug("Empty input, skip handling");
            return;
        }
        String[] args = text.split("\\s+");
        // Locale.ROOT keeps the lookup independent of the JVM's default locale; with a Turkish
        // default locale, plain toLowerCase() maps 'I' to a dotless 'ı' and "LIST" is not found.
        String cmdName = args[0].toLowerCase(java.util.Locale.ROOT);
        Command command = commands.get(cmdName);
        if (command == null) {
            String errorMsg = "Unknown command: " + cmdName;
            logger.error(errorMsg);
            view.printError(errorMsg);
            return;
        }
        logger.info("Executing command: {}", cmdName);
        command.execute(args, repository);
    }
}

20
w11/datacollect/exception/CrawlerException.java

@ -0,0 +1,20 @@
package com.example.datacollect.exception;
/**
 * Base checked exception for failures raised by the crawler.
 *
 * <p>Offers the same four constructors as {@link Exception}.
 */
public class CrawlerException extends Exception {

    /** Creates an exception without message or cause. */
    public CrawlerException() {
        super();
    }

    /**
     * @param cause the underlying failure
     */
    public CrawlerException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}

19
w11/datacollect/exception/NetworkException.java

@ -0,0 +1,19 @@
package com.example.datacollect.exception;
/**
 * Crawler exception type for network failures.
 *
 * <p>Every constructor simply delegates to the matching {@link CrawlerException} constructor.
 */
public class NetworkException extends CrawlerException {

    /** Creates an exception without message or cause. */
    public NetworkException() {
        super();
    }

    /**
     * @param cause the underlying failure
     */
    public NetworkException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     */
    public NetworkException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }
}

19
w11/datacollect/exception/ParseException.java

@ -0,0 +1,19 @@
package com.example.datacollect.exception;
/**
 * Crawler exception type for parsing failures.
 *
 * <p>Every constructor simply delegates to the matching {@link CrawlerException} constructor.
 */
public class ParseException extends CrawlerException {

    /** Creates an exception without message or cause. */
    public ParseException() {
        super();
    }

    /**
     * @param cause the underlying failure
     */
    public ParseException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

45
w11/datacollect/model/Article.java

@ -0,0 +1,45 @@
package com.example.datacollect.model;
public class Article {
private String title;
private String url;
private String content;
public Article(String title, String url, String content) {
this.title = title;
this.url = url;
this.content = content;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
@Override
public String toString() {
return "Article{"
+ "title='" + title + '\''
+ ", url='" + url + '\''
+ '}';
}
}

100
w11/datacollect/repository/ArticleRepository.java

@ -0,0 +1,100 @@
package com.example.datacollect.repository;
import com.example.datacollect.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * In-memory store of crawled articles backed by a list.
 *
 * <p>Insertions skip {@code null} entries and entries the list already {@code contains}; every
 * operation is logged.
 */
public class ArticleRepository {
    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);
    private final List<Article> articles = new ArrayList<>();

    /**
     * Returns the article at the given position.
     *
     * @param index zero-based position
     * @return the stored article
     * @throws IndexOutOfBoundsException if {@code index} is negative or not less than the size
     */
    public Article get(int index) {
        logger.debug("Getting article at index: {}", index);
        boolean inRange = index >= 0 && index < articles.size();
        if (inRange) {
            return articles.get(index);
        }
        String errorMsg = "Index out of bounds: index=" + index + ", size=" + articles.size();
        logger.error(errorMsg);
        throw new IndexOutOfBoundsException(errorMsg);
    }

    /**
     * Adds one article unless the list already contains it.
     *
     * @param article article to store
     * @throws IllegalArgumentException if {@code article} is {@code null}
     */
    public void add(Article article) {
        logger.debug("Adding article: {}", article);
        if (article == null) {
            String errorMsg = "Article cannot be null";
            logger.error(errorMsg);
            throw new IllegalArgumentException(errorMsg);
        }
        boolean alreadyStored = articles.contains(article);
        if (alreadyStored) {
            logger.warn("Article already exists: {}", article);
        } else {
            articles.add(article);
            logger.info("Added article: {}", article.getTitle());
        }
    }

    /**
     * Adds every article of the batch, skipping {@code null} elements and articles the list
     * already contains.
     *
     * @param articles batch to store; an empty list is a no-op
     * @throws IllegalArgumentException if {@code articles} is {@code null}
     */
    public void addAll(List<Article> articles) {
        logger.debug("Adding batch articles, size: {}", articles == null ? "null" : articles.size());
        if (articles == null) {
            String errorMsg = "Articles list cannot be null";
            logger.error(errorMsg);
            throw new IllegalArgumentException(errorMsg);
        }
        if (articles.isEmpty()) {
            logger.warn("Articles list is empty, skip addAll");
            return;
        }
        int addedCount = 0;
        for (Article candidate : articles) {
            if (candidate != null) {
                if (!this.articles.contains(candidate)) {
                    this.articles.add(candidate);
                    addedCount++;
                }
            } else {
                // A null element is skipped rather than failing the whole batch.
                logger.error("Skipping null article in batch add");
            }
        }
        logger.info("Batch added {} articles (skipped duplicates/null)", addedCount);
    }

    /**
     * @return an unmodifiable view of the stored articles
     */
    public List<Article> getAll() {
        List<Article> readOnly = Collections.unmodifiableList(articles);
        logger.debug("Getting all articles, size: {}", readOnly.size());
        return readOnly;
    }

    /**
     * @return number of stored articles
     */
    public int size() {
        int count = articles.size();
        logger.debug("Repository size: {}", count);
        return count;
    }

    /** Removes every stored article; does nothing except logging when already empty. */
    public void clear() {
        logger.warn("Clearing all articles (current size: {})", articles.size());
        if (!articles.isEmpty()) {
            articles.clear();
            logger.info("Cleared all articles successfully");
            return;
        }
        logger.info("Repository is already empty, skip clear");
    }

    /**
     * Tells whether any stored article has exactly the given URL.
     *
     * @param url URL to look for
     * @return {@code true} if at least one stored article's URL equals {@code url}
     * @throws IllegalArgumentException if {@code url} is {@code null} or blank
     */
    public boolean containsUrl(String url) {
        logger.debug("Checking if repository contains url: {}", url);
        if (url == null || url.isBlank()) {
            logger.error("URL cannot be null/blank");
            throw new IllegalArgumentException("URL cannot be null or blank");
        }
        for (Article stored : articles) {
            if (url.equals(stored.getUrl())) {
                return true;
            }
        }
        return false;
    }
}

47
w11/datacollect/strategy/BlogStrategy.java

@ -0,0 +1,47 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Strategy for URLs containing {@code blog.example.com}; every element matching
 * {@code .post-title} becomes one article with the page URL and empty content.
 */
public class BlogStrategy implements CrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class);
    private static final Pattern URL_PATTERN = Pattern.compile(".*blog\\.example\\.com.*");

    /**
     * @return 10, which is higher than the interface's default priority of 0
     */
    @Override
    public int getPriority() {
        return 10;
    }

    /**
     * @param url URL to test
     * @return {@code true} if the whole URL matches the blog pattern
     */
    @Override
    public boolean supports(String url) {
        boolean matched = URL_PATTERN.matcher(url).matches();
        logger.debug("URL {} support status: {}", url, matched);
        return matched;
    }

    /**
     * Extracts one article per {@code .post-title} element.
     *
     * @param url page URL, stored as the URL of every article
     * @param doc parsed page
     * @return the extracted articles, possibly empty
     * @throws ParseException wrapping any exception raised during extraction
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        try {
            logger.info("Start parsing blog articles from url: {}", url);
            List<Article> result = new ArrayList<>();
            Elements titleNodes = doc.select(".post-title");
            for (int i = 0; i < titleNodes.size(); i++) {
                Element titleNode = titleNodes.get(i);
                result.add(new Article(titleNode.text(), url, ""));
            }
            logger.debug("Parsed {} blog articles from url: {}", result.size(), url);
            return result;
        } catch (Exception failure) {
            logger.error("Failed to parse blog articles from url: {}", url, failure);
            throw new ParseException("Blog article parse failed for url: " + url, failure);
        }
    }
}

15
w11/datacollect/strategy/CrawlStrategy.java

@ -0,0 +1,15 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import java.util.List;
/**
 * Site-specific logic for turning a downloaded page into articles.
 */
public interface CrawlStrategy {

    /**
     * @param url URL the user asked for
     * @return {@code true} if this strategy can handle the URL
     */
    boolean supports(String url);

    /**
     * Extracts the articles contained in a page.
     *
     * @param url URL the page was loaded from
     * @param doc parsed page
     * @return the extracted articles
     * @throws ParseException if the page cannot be parsed
     */
    List<Article> parse(String url, Document doc) throws ParseException;

    /**
     * Priority of this strategy.
     *
     * @return 0 unless an implementation overrides it
     */
    default int getPriority() {
        return 0;
    }
}

25
w11/datacollect/strategy/DefaultStrategy.java

@ -0,0 +1,25 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
 * Catch-all strategy: accepts every URL and extracts no articles.
 *
 * <p>It does not override {@code getPriority()}, so it keeps the interface default.
 */
public class DefaultStrategy implements CrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(DefaultStrategy.class);

    /**
     * @param url URL to test; only logged
     * @return always {@code true}
     */
    @Override
    public boolean supports(String url) {
        logger.debug("默认策略支持所有 URL:{}", url);
        return true;
    }

    /**
     * Placeholder parser without any extraction logic.
     *
     * @param url page URL; only logged
     * @param doc parsed page; not inspected
     * @return an empty immutable list
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        logger.info("使用默认策略解析:{}", url);
        List<Article> nothing = List.of();
        return nothing;
    }
}

95
w11/datacollect/strategy/HnuNewsStrategy.java

@ -0,0 +1,95 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Strategy for URLs containing {@code news.hnu.edu.cn}.
 *
 * <p>Each {@code ul.list11 li} item with a link becomes one article: the title comes from
 * {@code h4.l2.h4s2}, the content from {@code p.l3.ps3}, and the URL from the link's
 * {@code href}. Items without a title are skipped.
 */
public class HnuNewsStrategy implements CrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class);
    private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.hnu\\.edu\\.cn.*");
    /** Site root used only when a relative link cannot be resolved against the document. */
    private static final String SITE_ROOT = "https://news.hnu.edu.cn/";

    /**
     * @param url URL to test
     * @return {@code true} if the whole URL matches the HNU news pattern
     */
    @Override
    public boolean supports(String url) {
        return URL_PATTERN.matcher(url).matches();
    }

    /**
     * Extracts the news entries of a list page.
     *
     * @param url page URL; used in log messages
     * @param doc parsed page
     * @return the extracted articles; empty when the list selector matches nothing
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();
        // The selector reflects the page structure at the time of writing and may need updating.
        Elements listItems = doc.select("ul.list11 li");
        if (listItems.isEmpty()) {
            logger.warn("在 URL [{}] 中未找到符合选择器 'ul.list11 li' 的新闻列表项。可能网页结构已更新。", url);
            return articles;
        }
        for (Element li : listItems) {
            Element link = li.selectFirst("a");
            if (link == null) {
                logger.debug("跳过一个无链接的列表项: {}", li.toString());
                continue;
            }
            String articleUrl = resolveArticleUrl(link);

            String title = "";
            Element titleEl = link.selectFirst("h4.l2.h4s2");
            if (titleEl != null) {
                title = titleEl.text().trim();
            } else {
                logger.debug("在链接 [{}] 中未找到标题元素 h4.l2.h4s2", articleUrl);
            }

            // Content is optional, so a missing summary is not reported.
            String content = "";
            Element contentEl = link.selectFirst("p.l3.ps3");
            if (contentEl != null) {
                content = contentEl.text().trim();
            }

            if (!title.isEmpty()) {
                articles.add(new Article(title, articleUrl, content));
                logger.debug("解析到新闻条目: [标题] {} - [URL] {}", title, articleUrl);
            } else {
                logger.trace("跳过空标题的链接: {}", articleUrl);
            }
        }
        logger.info("成功解析 URL [{}],共提取 {} 篇新闻。", url, articles.size());
        return articles;
    }

    /**
     * Turns the link's {@code href} into an absolute URL.
     *
     * <p>Links that already start with {@code http} are returned unchanged. Other links are
     * resolved against the document's base URI by jsoup. When that yields nothing (for example
     * because the document has no base URI), leading {@code ../}, {@code ./} and {@code /} are
     * stripped and the remainder is appended to the site root. The earlier string-based
     * {@code /../} collapsing removed the host name for links such as {@code ../info/1.htm}.
     */
    private static String resolveArticleUrl(Element link) {
        String href = link.attr("href");
        if (href.startsWith("http")) {
            return href;
        }
        String absolute = link.absUrl("href");
        if (!absolute.isEmpty()) {
            return absolute;
        }
        String path = href;
        boolean stripped = true;
        while (stripped) {
            if (path.startsWith("../")) {
                path = path.substring(3);
            } else if (path.startsWith("./")) {
                path = path.substring(2);
            } else if (path.startsWith("/")) {
                path = path.substring(1);
            } else {
                stripped = false;
            }
        }
        return SITE_ROOT + path;
    }

    /**
     * @return 15
     */
    @Override
    public int getPriority() {
        return 15;
    }
}

57
w11/datacollect/strategy/NewsStrategy.java

@ -0,0 +1,57 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Strategy for URLs containing {@code news.example.com}; every non-empty
 * {@code .article-headline} element becomes one article with the page URL and empty content.
 */
public class NewsStrategy implements CrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class);
    private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.example\\.com.*");

    /**
     * @return 10
     */
    @Override
    public int getPriority() {
        return 10;
    }

    /**
     * @param url URL to test
     * @return {@code true} if the whole URL matches the news pattern
     */
    @Override
    public boolean supports(String url) {
        return URL_PATTERN.matcher(url).matches();
    }

    /**
     * Extracts one article per headline element whose trimmed text is not empty.
     *
     * @param url page URL, stored as the URL of every article
     * @param doc parsed page
     * @return the extracted articles; empty when the selector matches nothing
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> result = new ArrayList<>();
        logger.debug("开始解析 URL: [{}]", url);
        Elements headlines = doc.select(".article-headline");
        if (headlines.isEmpty()) {
            logger.warn("在 URL [{}] 中未找到符合选择器 '.article-headline' 的文章标题元素。", url);
            return result;
        }
        for (int i = 0; i < headlines.size(); i++) {
            String headline = headlines.get(i).text().trim();
            if (headline.isEmpty()) {
                continue;
            }
            result.add(new Article(headline, url, ""));
            logger.trace("提取到文章标题: {}", headline);
        }
        logger.info("成功解析 URL [{}],共提取 {} 篇文章。", url, result.size());
        return result;
    }
}

29
w11/datacollect/strategy/StrategyFactory.java

@ -0,0 +1,29 @@
package com.example.datacollect.strategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Holds the known crawl strategies and picks the one to use for a URL.
 */
public class StrategyFactory {
    private final List<CrawlStrategy> strategies = new ArrayList<>();

    /** Registers the HNU news, blog and news strategies, followed by the default strategy. */
    public StrategyFactory() {
        register(new HnuNewsStrategy());
        register(new BlogStrategy());
        register(new NewsStrategy());
        register(new DefaultStrategy());
    }

    /**
     * Adds a strategy to the candidates considered by {@link #getStrategy(String)}.
     *
     * @param strategy strategy to add
     */
    public void register(CrawlStrategy strategy) {
        strategies.add(strategy);
    }

    /**
     * Returns the supporting strategy with the highest priority.
     *
     * <p>Candidates are ordered by descending priority (registration order breaks ties) and
     * asked in that order; the first one that supports the URL wins.
     *
     * @param url URL to find a strategy for
     * @return the chosen strategy, or {@code null} if no registered strategy supports the URL
     */
    public CrawlStrategy getStrategy(String url) {
        List<CrawlStrategy> byPriority = new ArrayList<>(strategies);
        byPriority.sort((s1, s2) -> Integer.compare(s2.getPriority(), s1.getPriority()));
        for (CrawlStrategy candidate : byPriority) {
            if (candidate.supports(url)) {
                return candidate;
            }
        }
        return null;
    }
}

56
w11/datacollect/view/ConsoleView.java

@ -0,0 +1,56 @@
package com.example.datacollect.view;
import com.example.datacollect.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Scanner;
/**
 * Console front end: reads user input from standard input and prints coloured messages and
 * article lists to standard output. Every message is also written to the log.
 */
public class ConsoleView {
    private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class);
    private static final String ANSI_RESET = "\u001B[0m";
    private static final String ANSI_GREEN = "\u001B[32m";
    private static final String ANSI_RED = "\u001B[31m";
    private static final String ANSI_BLUE = "\u001B[34m";
    private final Scanner scanner = new Scanner(System.in);

    /**
     * Prints the {@code "> "} prompt and reads the next line.
     *
     * @return the line typed by the user
     * @throws java.util.NoSuchElementException if the input has no further line
     */
    public String readLine() {
        System.out.print("> ");
        String input = scanner.nextLine();
        logger.debug("User input: {}", input);
        return input;
    }

    /**
     * Logs the message at info level and prints it in green.
     *
     * @param msg text to show
     */
    public void printSuccess(String msg) {
        logger.info(msg);
        System.out.println(ANSI_GREEN + msg + ANSI_RESET);
    }

    /**
     * Logs the message at error level and prints it in red.
     *
     * @param msg text to show
     */
    public void printError(String msg) {
        logger.error(msg);
        System.out.println(ANSI_RED + msg + ANSI_RESET);
    }

    /**
     * Logs the message at info level and prints it in blue.
     *
     * @param msg text to show
     */
    public void printInfo(String msg) {
        logger.info(msg);
        System.out.println(ANSI_BLUE + msg + ANSI_RESET);
    }

    /**
     * Prints a numbered {@code title | url} line per article, or a hint when the list is empty.
     *
     * @param articles articles to show; must not be {@code null}
     */
    public void display(List<Article> articles) {
        logger.debug("Displaying {} articles", articles.size());
        if (articles.isEmpty()) {
            // printInfo logs the message itself; logging it here too duplicated the log entry.
            printInfo("暂无文章,请先执行 crawl。");
            return;
        }
        for (int i = 0; i < articles.size(); i++) {
            Article a = articles.get(i);
            String articleStr = (i + 1) + ". " + a.getTitle() + " | " + a.getUrl();
            System.out.println(articleStr);
            logger.debug(articleStr);
        }
    }
}

39
w11/logback.xml

@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration scan="true" scanPeriod="30 seconds">
<!-- Console output -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- File output (rolled over daily) -->
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/crawler.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/crawler.%d{yyyy-MM-dd}.log</fileNamePattern>
<maxHistory>7</maxHistory> <!-- keep 7 days of logs -->
<totalSizeCap>100MB</totalSizeCap> <!-- cap on the total size of all log files -->
</rollingPolicy>
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- Root log level -->
<root level="INFO">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</root>
<!-- Log level for the application's own package; additivity is off so events are not also passed to the root logger's appenders -->
<logger name="com.example.datacollect" level="DEBUG" additivity="false">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</logger>
<!-- Third-party library log level (reduce jsoup logging) -->
<logger name="org.jsoup" level="WARN"/>
</configuration>

96
w11/pom.xml

@ -0,0 +1,96 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>datacollect-cli</artifactId>
<version>0.1.0</version>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Logging versions are declared once here to avoid conflicts -->
<slf4j.version>2.0.9</slf4j.version>
<logback.version>1.4.14</logback.version>
</properties>
<repositories>
<!-- Aliyun mirror, added to speed up downloads -->
<repository>
<id>aliyun</id>
<name>Aliyun Maven</name>
<url>https://maven.aliyun.com/repository/public</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- SLF4J core API -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<!-- Logback implementation -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>${logback.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>${maven.compiler.source}</source>
<target>${maven.compiler.target}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<archive>
<manifest>
<mainClass>com.example.datacollect.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
Loading…
Cancel
Save