From 506794f9d9efafc2b790d73d3dc7ba275464e064 Mon Sep 17 00:00:00 2001 From: WangYangyang <3093159564@qq.com> Date: Tue, 19 May 2026 17:16:57 +0800 Subject: [PATCH] =?UTF-8?q?=E7=8E=8B=E7=83=8A=E7=83=8A202302050115W11?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- w11/datacollect/Main.java | 34 ++++++ w11/datacollect/command/AnalyzeCommand.java | 101 +++++++++++++++ w11/datacollect/command/Command.java | 8 ++ w11/datacollect/command/CrawlCommand.java | 115 ++++++++++++++++++ w11/datacollect/command/ExitCommand.java | 34 ++++++ w11/datacollect/command/HelpCommand.java | 32 +++++ w11/datacollect/command/ListCommand.java | 33 +++++ .../controller/CrawlerController.java | 62 ++++++++++ .../exception/CrawlerException.java | 20 +++ .../exception/NetworkException.java | 19 +++ w11/datacollect/exception/ParseException.java | 19 +++ w11/datacollect/model/Article.java | 45 +++++++ .../repository/ArticleRepository.java | 100 +++++++++++++++ w11/datacollect/strategy/BlogStrategy.java | 47 +++++++ w11/datacollect/strategy/CrawlStrategy.java | 15 +++ w11/datacollect/strategy/DefaultStrategy.java | 25 ++++ w11/datacollect/strategy/HnuNewsStrategy.java | 95 +++++++++++++++ w11/datacollect/strategy/NewsStrategy.java | 57 +++++++++ w11/datacollect/strategy/StrategyFactory.java | 29 +++++ w11/datacollect/view/ConsoleView.java | 56 +++++++++ w11/logback.xml | 39 ++++++ w11/pom.xml | 96 +++++++++++++++ 22 files changed, 1081 insertions(+) create mode 100644 w11/datacollect/Main.java create mode 100644 w11/datacollect/command/AnalyzeCommand.java create mode 100644 w11/datacollect/command/Command.java create mode 100644 w11/datacollect/command/CrawlCommand.java create mode 100644 w11/datacollect/command/ExitCommand.java create mode 100644 w11/datacollect/command/HelpCommand.java create mode 100644 w11/datacollect/command/ListCommand.java create mode 100644 w11/datacollect/controller/CrawlerController.java create mode 100644 w11/datacollect/exception/CrawlerException.java create mode 100644 w11/datacollect/exception/NetworkException.java create mode 100644 w11/datacollect/exception/ParseException.java create mode 100644 w11/datacollect/model/Article.java create mode 100644 w11/datacollect/repository/ArticleRepository.java create mode 100644 w11/datacollect/strategy/BlogStrategy.java create mode 100644 w11/datacollect/strategy/CrawlStrategy.java create mode 100644 w11/datacollect/strategy/DefaultStrategy.java create mode 100644 w11/datacollect/strategy/HnuNewsStrategy.java create mode 100644 w11/datacollect/strategy/NewsStrategy.java create mode 100644 w11/datacollect/strategy/StrategyFactory.java create mode 100644 w11/datacollect/view/ConsoleView.java create mode 100644 w11/logback.xml create mode 100644 w11/pom.xml diff --git a/w11/datacollect/Main.java b/w11/datacollect/Main.java new file mode 100644 index 0000000..00d27b8 --- /dev/null +++ b/w11/datacollect/Main.java @@ -0,0 +1,34 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class Main { + private static final Logger logger = LoggerFactory.getLogger(Main.class); + + public static void main(String[] args) { + logger.info("Starting CLI Crawler (w10_3)"); + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + String welcomeMsg = "Welcome to CLI Crawler (w10_3)! Type help for commands."; + logger.info(welcomeMsg); + view.printSuccess(welcomeMsg); + + try { + while (true) { + controller.handle(view.readLine()); + } + } catch (Exception e) { + logger.error("Unexpected error in main loop", e); + view.printError("System error: " + e.getMessage()); + System.exit(1); + } + } +} \ No newline at end of file diff --git a/w11/datacollect/command/AnalyzeCommand.java b/w11/datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..efa1e85 --- /dev/null +++ b/w11/datacollect/command/AnalyzeCommand.java @@ -0,0 +1,101 @@ +package com.example.datacollect.command; + +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.stream.Collectors; + +public class AnalyzeCommand implements Command { + + // 1. 添加 Logger 成员 + private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); + + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + logger.error("指令参数错误,正确用法: analyze "); + view.printError("Usage: analyze "); // 保留控制台提示,方便用户直接看到 + return; + } + + String url = args[1]; + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + logger.error("未找到适用于 URL [{}] 的抓取策略", url); + view.printError("No strategy found for: " + url); + return; + } + + try { + logger.info("开始分析目标网站: {}", url); + Document doc = Jsoup.connect(url).get(); + + // 调用策略解析,但不存入 Repository + List
articles = strategy.parse(url, doc); + + // 统计信息 + int total = articles.size(); + double avgTitleLen = articles.stream() + .mapToInt(a -> a.getTitle().length()) + .average() + .orElse(0.0); + + // Top 5 按标题长度排序 + List
top5 = articles.stream() + .sorted((a, b) -> Integer.compare(b.getTitle().length(), a.getTitle().length())) + .limit(5) + .collect(Collectors.toList()); + + // 输出结果到日志 + logger.info("=== 分析结果 ==="); + logger.info("提取文章总数: {}", total); + logger.info("平均标题长度: {:.2f} 字符", avgTitleLen); + logger.info("Top 5 文章 (按标题长度排序):"); + + int rank = 1; + for (Article a : top5) { + logger.info("{}. {} ({} 字符)", rank, a.getTitle(), a.getTitle().length()); + rank++; + } + logger.info("=================="); + + // 保留原有的控制台输出,确保用户交互体验不受影响 + view.printInfo("=== Analysis Result ==="); + view.printInfo("Total Articles: " + total); + view.printInfo("Avg Title Length: " + String.format("%.2f", avgTitleLen)); + view.printInfo("Top 5 Articles (by Title Length):"); + rank = 1; + for (Article a : top5) { + view.printInfo(rank + ". " + a.getTitle() + " (" + a.getTitle().length() + " chars)"); + rank++; + } + view.printInfo("========================"); + + } catch (Exception e) { + logger.error("分析 URL [{}] 时发生异常: ", url, e); // 传入异常对象 e,以便记录完整堆栈 + view.printError("Failed to analyze: " + e.getMessage()); + } + } +} + diff --git a/w11/datacollect/command/Command.java b/w11/datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w11/datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w11/datacollect/command/CrawlCommand.java b/w11/datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..512586a --- /dev/null +++ b/w11/datacollect/command/CrawlCommand.java @@ -0,0 +1,115 @@ +package com.example.datacollect.command; + +import com.example.datacollect.exception.NetworkException; +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class CrawlCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); + private static final int MAX_RETRY = 3; // 最大重试次数 + private static final long RETRY_INTERVAL = 1000; // 重试间隔(毫秒) + + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + String errorMsg = "Crawl command usage: crawl "; + logger.error(errorMsg); + view.printError(errorMsg); + return; + } + String url = args[1]; + logger.info("Start crawling url: {}", url); + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + String errorMsg = "No crawl strategy found for url: " + url; + logger.error(errorMsg); + view.printError(errorMsg); + return; + } + + // 重试逻辑 + int retryCount = 0; + while (retryCount < MAX_RETRY) { + try { + Document doc = fetchDocumentWithRetry(url, retryCount); + List
articles = strategy.parse(url, doc); + + // 批量添加(复用Repository的addAll方法) + repository.addAll(articles); + + String successMsg = "Crawled " + articles.size() + " articles from url: " + url; + logger.info(successMsg); + view.printSuccess(successMsg); + return; // 成功则退出重试循环 + } catch (NetworkException e) { + retryCount++; + String retryMsg = String.format("Network error (retry %d/%d): %s", retryCount, MAX_RETRY, e.getMessage()); + logger.warn(retryMsg); + view.printError(retryMsg); + + if (retryCount >= MAX_RETRY) { + String failMsg = "Failed to crawl url after " + MAX_RETRY + " retries: " + url; + logger.error(failMsg, e); + view.printError(failMsg); + } + + // 重试间隔 + try { + TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + logger.error("Retry sleep interrupted", ie); + break; + } + } catch (ParseException e) { + String errorMsg = "Parse failed for url: " + url; + logger.error(errorMsg, e); + view.printError(errorMsg); + return; // 解析异常不重试 + } catch (Exception e) { + String errorMsg = "Unexpected error when crawling url: " + url; + logger.error(errorMsg, e); + view.printError(errorMsg); + return; + } + } + } + + // 抽取文档获取逻辑,抛出网络异常 + private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException { + try { + logger.debug("Fetching document (retry {}) for url: {}", retryCount, url); + return Jsoup.connect(url) + .timeout(5000) // 超时时间5秒 + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") + .get(); + } catch (Exception e) { + throw new NetworkException("Failed to fetch document (retry " + retryCount + ") for url: " + url, e); + } + } +} diff --git a/w11/datacollect/command/ExitCommand.java b/w11/datacollect/command/ExitCommand.java new file mode 100644 index 0000000..4c5473a --- /dev/null +++ b/w11/datacollect/command/ExitCommand.java @@ -0,0 +1,34 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ExitCommand implements Command { + + // 1. 添加 Logger 成员 + private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); + + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + // 记录退出日志 + logger.info("用户请求退出程序。"); + view.printSuccess("Bye!"); + + // 在调用 exit 前可以记录一些系统状态,或者直接记录 + logger.info("程序已终止。"); + System.exit(0); + } +} diff --git a/w11/datacollect/command/HelpCommand.java b/w11/datacollect/command/HelpCommand.java new file mode 100644 index 0000000..bbdd263 --- /dev/null +++ b/w11/datacollect/command/HelpCommand.java @@ -0,0 +1,32 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HelpCommand implements Command { + + // 1. 添加 Logger 成员 + private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); + + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.debug("用户请求查看帮助信息。"); + + // 保留原有的帮助信息输出 + view.printInfo("Commands: crawl , list, analyze, help, exit"); + // 建议:将硬编码的命令列表改为动态获取(如果 Command 接口有 getType 或类似方法),目前保持原样 + } +} diff --git a/w11/datacollect/command/ListCommand.java b/w11/datacollect/command/ListCommand.java new file mode 100644 index 0000000..0be5ddf --- /dev/null +++ b/w11/datacollect/command/ListCommand.java @@ -0,0 +1,33 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ListCommand implements Command { + + // 1. 添加 Logger 成员 + private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); + + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("正在执行 list 命令,准备展示已抓取的文章列表。"); + + // 保留原有的视图输出 + view.display(repository.getAll()); + + logger.debug("当前仓库中共有 {} 篇文章已加载至视图。", repository.getAll().size()); + } +} diff --git a/w11/datacollect/controller/CrawlerController.java b/w11/datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..721d45a --- /dev/null +++ b/w11/datacollect/controller/CrawlerController.java @@ -0,0 +1,62 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.Command; +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + logger.info("Registering crawler commands"); + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new AnalyzeCommand(view, strategyFactory));//新增 + register(new ExitCommand(view)); + logger.debug("Registered commands: {}", commands.keySet()); + } + + private void register(Command command) { + commands.put(command.getName(), command); + logger.debug("Registered command: {}", command.getName()); + } + + public void handle(String input) { + String text = input == null ? "" : input.trim(); + logger.debug("Handling input: {}", text); + if (text.isEmpty()) { + logger.debug("Empty input, skip handling"); + return; + } + + String[] args = text.split("\\s+"); + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + if (command == null) { + String errorMsg = "Unknown command: " + cmdName; + logger.error(errorMsg); + view.printError(errorMsg); + return; + } + + logger.info("Executing command: {}", cmdName); + command.execute(args, repository); + } +} diff --git a/w11/datacollect/exception/CrawlerException.java b/w11/datacollect/exception/CrawlerException.java new file mode 100644 index 0000000..d282555 --- /dev/null +++ b/w11/datacollect/exception/CrawlerException.java @@ -0,0 +1,20 @@ +package com.example.datacollect.exception; + +public class CrawlerException extends Exception { + public CrawlerException() { + super(); + } + + public CrawlerException(String message) { + super(message); + } + + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } + + public CrawlerException(Throwable cause) { + super(cause); + } +} + diff --git a/w11/datacollect/exception/NetworkException.java b/w11/datacollect/exception/NetworkException.java new file mode 100644 index 0000000..1b11f31 --- /dev/null +++ b/w11/datacollect/exception/NetworkException.java @@ -0,0 +1,19 @@ +package com.example.datacollect.exception; + +public class NetworkException extends CrawlerException{ + public NetworkException() { + super(); + } + + public NetworkException(String message) { + super(message); + } + + public NetworkException(String message, Throwable cause) { + super(message, cause); + } + + public NetworkException(Throwable cause) { + super(cause); + } +} diff --git a/w11/datacollect/exception/ParseException.java b/w11/datacollect/exception/ParseException.java new file mode 100644 index 0000000..2e6d499 --- /dev/null +++ b/w11/datacollect/exception/ParseException.java @@ -0,0 +1,19 @@ +package com.example.datacollect.exception; + +public class ParseException extends CrawlerException{ + public ParseException() { + super(); + } + + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } + + public ParseException(Throwable cause) { + super(cause); + } +} diff --git a/w11/datacollect/model/Article.java b/w11/datacollect/model/Article.java new file mode 100644 index 0000000..147dbe6 --- /dev/null +++ b/w11/datacollect/model/Article.java @@ -0,0 +1,45 @@ +package com.example.datacollect.model; + +public class Article { + private String title; + private String url; + private String content; + + public Article(String title, String url, String content) { + this.title = title; + this.url = url; + this.content = content; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w11/datacollect/repository/ArticleRepository.java b/w11/datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..a8361bf --- /dev/null +++ b/w11/datacollect/repository/ArticleRepository.java @@ -0,0 +1,100 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ArticleRepository { + private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); + private final List
articles = new ArrayList<>(); + + // 新增:根据索引获取文章(防御索引越界) + public Article get(int index) { + logger.debug("Getting article at index: {}", index); + if (index < 0 || index >= articles.size()) { + String errorMsg = "Index out of bounds: index=" + index + ", size=" + articles.size(); + logger.error(errorMsg); + throw new IndexOutOfBoundsException(errorMsg); + } + return articles.get(index); + } + + public void add(Article article) { + logger.debug("Adding article: {}", article); + if (article == null) { + String errorMsg = "Article cannot be null"; + logger.error(errorMsg); + throw new IllegalArgumentException(errorMsg); + } + // 新增:防御重复添加(可选) + if (articles.contains(article)) { + logger.warn("Article already exists: {}", article); + return; + } + articles.add(article); + logger.info("Added article: {}", article.getTitle()); + } + + // ★ 新增:批量添加方法以及注意防御 null + public void addAll(List
articles) { + logger.debug("Adding batch articles, size: {}", articles == null ? "null" : articles.size()); + if (articles == null) { + String errorMsg = "Articles list cannot be null"; + logger.error(errorMsg); + throw new IllegalArgumentException(errorMsg); + } + if (articles.isEmpty()) { + logger.warn("Articles list is empty, skip addAll"); + return; + } + int addedCount = 0; + for (Article article : articles) { + if (article == null) { + logger.error("Skipping null article in batch add"); + continue; // 或抛出异常,根据业务选择 + } + if (!this.articles.contains(article)) { + this.articles.add(article); + addedCount++; + } + } + logger.info("Batch added {} articles (skipped duplicates/null)", addedCount); + } + + public List
getAll() { + List
unmodifiableList = Collections.unmodifiableList(articles); + logger.debug("Getting all articles, size: {}", unmodifiableList.size()); + return unmodifiableList; + } + + public int size() { + int size = articles.size(); + logger.debug("Repository size: {}", size); + return size; + } + + // 新增:清空前校验 + 日志 + public void clear() { + logger.warn("Clearing all articles (current size: {})", articles.size()); + if (articles.isEmpty()) { + logger.info("Repository is already empty, skip clear"); + return; + } + articles.clear(); + logger.info("Cleared all articles successfully"); + } + + // 新增:检查是否包含指定URL的文章(防御检查) + public boolean containsUrl(String url) { + logger.debug("Checking if repository contains url: {}", url); + if (url == null || url.isBlank()) { + logger.error("URL cannot be null/blank"); + throw new IllegalArgumentException("URL cannot be null or blank"); + } + return articles.stream().anyMatch(article -> url.equals(article.getUrl())); + } +} diff --git a/w11/datacollect/strategy/BlogStrategy.java b/w11/datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..1d2bacd --- /dev/null +++ b/w11/datacollect/strategy/BlogStrategy.java @@ -0,0 +1,47 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class BlogStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class); + private static final Pattern URL_PATTERN = Pattern.compile(".*blog\\.example\\.com.*"); + + @Override + public boolean supports(String url) { + boolean isSupported = URL_PATTERN.matcher(url).matches(); + logger.debug("URL {} support status: {}", url, isSupported); + return isSupported; + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + try { + logger.info("Start parsing blog articles from url: {}", url); + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + logger.debug("Parsed {} blog articles from url: {}", articles.size(), url); + return articles; + } catch (Exception e) { + logger.error("Failed to parse blog articles from url: {}", url, e); + throw new ParseException("Blog article parse failed for url: " + url, e); + } + } + + @Override + public int getPriority() { + return 10; // 优先级高于默认策略 + } +} diff --git a/w11/datacollect/strategy/CrawlStrategy.java b/w11/datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..8a905ff --- /dev/null +++ b/w11/datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,15 @@ +package com.example.datacollect.strategy; +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy { + List
parse(String url, Document doc) throws ParseException; + boolean supports(String url); + //增加优先级 + default int getPriority(){ + return 0; + } + +} diff --git a/w11/datacollect/strategy/DefaultStrategy.java b/w11/datacollect/strategy/DefaultStrategy.java new file mode 100644 index 0000000..c13b90d --- /dev/null +++ b/w11/datacollect/strategy/DefaultStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.List; + +public class DefaultStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(DefaultStrategy.class); + + @Override + public boolean supports(String url) { + logger.debug("默认策略支持所有 URL:{}", url); + return true; + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + logger.info("使用默认策略解析:{}", url); + // 你的解析逻辑 + return List.of(); + } +} \ No newline at end of file diff --git a/w11/datacollect/strategy/HnuNewsStrategy.java b/w11/datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..a66b20d --- /dev/null +++ b/w11/datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,95 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class HnuNewsStrategy implements CrawlStrategy { + + // 1. 添加 Logger 成员 + private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class); + + // 2. 修正 URL 匹配逻辑(原逻辑仅匹配域名,建议增加路径灵活性) + private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.hnu\\.edu\\.cn.*"); + + @Override + public boolean supports(String url) { + return URL_PATTERN.matcher(url).matches(); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + + // 原有逻辑:尝试选择列表项 + // 注意:根据2026年5月的网页结构,实际可能需要调整为 div 或其他容器 + Elements listItems = doc.select("ul.list11 li"); + + if (listItems.isEmpty()) { + logger.warn("在 URL [{}] 中未找到符合选择器 'ul.list11 li' 的新闻列表项。可能网页结构已更新。", url); + return articles; + } + + for (Element li : listItems) { + Element link = li.selectFirst("a"); + if (link == null) { + logger.debug("跳过一个无链接的列表项: {}", li.toString()); + continue; + } + + String articleUrl = link.attr("href"); + // 3. 修正 URL 拼接逻辑(原逻辑 replace("..") 可能不够健壮) + if (!articleUrl.startsWith("http")) { + // 使用 URI 或简单的字符串处理来规范化路径 + articleUrl = "https://news.hnu.edu.cn/" + articleUrl; + // 这里简单处理,实际可能需要更复杂的路径规范化 + while (articleUrl.contains("/../")) { + int index = articleUrl.indexOf("/../"); + int prevSlash = articleUrl.lastIndexOf('/', index - 1); + if (prevSlash != -1) { + articleUrl = articleUrl.substring(0, prevSlash) + articleUrl.substring(index + 3); + } else { + break; + } + } + } + + String title = ""; + Element titleEl = link.selectFirst("h4.l2.h4s2"); + if (titleEl != null) { + title = titleEl.text().trim(); + } else { + logger.debug("在链接 [{}] 中未找到标题元素 h4.l2.h4s2", articleUrl); + } + + String content = ""; + Element contentEl = link.selectFirst("p.l3.ps3"); + if (contentEl != null) { + content = contentEl.text().trim(); + } + // 不再输出空内容警告,因 content 可能为空 + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + logger.debug("解析到新闻条目: [标题] {} - [URL] {}", title, articleUrl); + } else { + logger.trace("跳过空标题的链接: {}", articleUrl); + } + } + + logger.info("成功解析 URL [{}],共提取 {} 篇新闻。", url, articles.size()); + return articles; + } + + @Override + public int getPriority() { + return 15; + } +} \ No newline at end of file diff --git a/w11/datacollect/strategy/NewsStrategy.java b/w11/datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..b8f2e08 --- /dev/null +++ b/w11/datacollect/strategy/NewsStrategy.java @@ -0,0 +1,57 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class NewsStrategy implements CrawlStrategy { + + // 1. 添加 Logger 成员 + private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class); + + // 使用正则匹配 + private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.example\\.com.*"); + + @Override + public boolean supports(String url) { + return URL_PATTERN.matcher(url).matches(); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + + // 2. 添加解析过程日志 + logger.debug("开始解析 URL: [{}]", url); + + Elements items = doc.select(".article-headline"); + + if (items.isEmpty()) { + logger.warn("在 URL [{}] 中未找到符合选择器 '.article-headline' 的文章标题元素。", url); + return articles; + } + + for (Element e : items) { + String title = e.text().trim(); + if (!title.isEmpty()) { + articles.add(new Article(title, url, "")); + logger.trace("提取到文章标题: {}", title); + } + } + + logger.info("成功解析 URL [{}],共提取 {} 篇文章。", url, articles.size()); + return articles; + } + + @Override + public int getPriority() { + return 10; + } +} diff --git a/w11/datacollect/strategy/StrategyFactory.java b/w11/datacollect/strategy/StrategyFactory.java new file mode 100644 index 0000000..df301ee --- /dev/null +++ b/w11/datacollect/strategy/StrategyFactory.java @@ -0,0 +1,29 @@ +package com.example.datacollect.strategy; + +import java.util.ArrayList; +import java.util.List; + +public class StrategyFactory { + private final List strategies = new ArrayList<>(); + + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + strategies.add(new BlogStrategy()); + strategies.add(new NewsStrategy()); + //注册默认策略 + strategies.add(new DefaultStrategy()); + } + + public CrawlStrategy getStrategy(String url) { + //按优先级降序排序 + return strategies.stream() + .sorted((s1, s2) -> Integer.compare(s2.getPriority(), s1.getPriority())) + .filter(s -> s.supports(url)) + .findFirst() + .orElse(null); // 如果默认策略未匹配到,返回 null 或默认策略本身 + } + + public void register(CrawlStrategy strategy) { + strategies.add(strategy); + } +} \ No newline at end of file diff --git a/w11/datacollect/view/ConsoleView.java b/w11/datacollect/view/ConsoleView.java new file mode 100644 index 0000000..5ab8fd3 --- /dev/null +++ b/w11/datacollect/view/ConsoleView.java @@ -0,0 +1,56 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + String input = scanner.nextLine(); + logger.debug("User input: {}", input); + return input; + } + + public void printSuccess(String msg) { + logger.info(msg); + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + logger.error(msg); + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + logger.info(msg); + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List
articles) { + logger.debug("Displaying {} articles", articles.size()); + if (articles.isEmpty()) { + String emptyMsg = "暂无文章,请先执行 crawl。"; + logger.info(emptyMsg); + printInfo(emptyMsg); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + String articleStr = (i + 1) + ". " + a.getTitle() + " | " + a.getUrl(); + System.out.println(articleStr); + logger.debug(articleStr); + } + } +} diff --git a/w11/logback.xml b/w11/logback.xml new file mode 100644 index 0000000..377c362 --- /dev/null +++ b/w11/logback.xml @@ -0,0 +1,39 @@ + + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n + UTF-8 + + + + + + logs/crawler.log + + logs/crawler.%d{yyyy-MM-dd}.log + 7 + 100MB + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/w11/pom.xml b/w11/pom.xml new file mode 100644 index 0000000..636a5d7 --- /dev/null +++ b/w11/pom.xml @@ -0,0 +1,96 @@ + + + 4.0.0 + com.example + datacollect-cli + 0.1.0 + jar + + + 11 + 11 + UTF-8 + + 2.0.9 + 1.4.14 + + + + + + aliyun + Aliyun Maven + https://maven.aliyun.com/repository/public + + true + + + true + + + + + + + org.jsoup + jsoup + 1.17.2 + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + + ch.qos.logback + logback-classic + ${logback.version} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + ${maven.compiler.source} + ${maven.compiler.target} + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.datacollect.Main + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + \ No newline at end of file